# Cryout: R script that scrapes Khan Academy Bulgarian translation progress
# from Crowdin and writes per-topic CSV summaries.

# 1. User inputs.
# Output file names: a fixed name (overwritten each run) and a dated copy.
# Sys.Date is a function and must be CALLED — a bare Sys.Date would paste the
# deparsed function source into the file name instead of today's date.
assume.output_name <- paste0("KABG_crowdin_topics_", ".csv")
assume.output_name.date <- paste0("KABG_crowdin_topics_", Sys.Date(), ".csv")
# Local folder where the CSV outputs are written.
folder.output <- "C:\\Archive\\Data\\Dropbox\\PersWork\\Khan\\Translation\\Phrases\\Output"
# Crowdin project page to scrape.
url <- "https://crowdin.com/project/khanacademy/bg"
# Translation-speed assumption used to estimate remaining hours of work.
assume.words_per_hour <- 500
# 1. USER INPUTS (assignments above).


# 0. Manual alternative: download the HTML source of
#    https://crowdin.com/project/khanacademy/bg, then:
#      setwd(folder.input)
#      input <- read.csv("KABG_phrases_20150622.html", header = FALSE, sep = ",", stringsAsFactors = FALSE)

# 0.1. Automate the download of the Crowdin project page HTML.
# require() is for conditional use, not loading (it returns FALSE instead of
# erroring); check availability with requireNamespace() and load with library().
if (!requireNamespace("RCurl", quietly = TRUE)) {
  install.packages("RCurl", repos = "http://cran.at.r-project.org/")
}
library("RCurl")
# Fetch the page source. ssl.verifypeer = FALSE skips certificate
# verification — NOTE(review): kept for compatibility, but confirm this is
# still required; it disables TLS peer checks.
input.url <- getURL(url, ssl.verifypeer = FALSE)
# Strip all double quotes, then split on commas into a one-column data frame.
input.url.clean <- gsub('\"', '', input.url)
input <- data.frame(strsplit(input.url.clean, split = ","),
                    stringsAsFactors = FALSE)
colnames(input) <- "V1"
# 0.1. Automate the download.

# 0.2. Cut the input down to the block between the language marker and the
# permissions marker (exclusive on both ends).
input.clean <- which(input == " CURRENT_LANGUAGE_NAME = Bulgarian")[1]
# [1] guards against the pattern matching more than one row, which would make
# the scalar assignment below fail with a length error.
input.clean[2] <- which(grepl("DOWNLOAD_PERMISSIONS", input$V1))[1]
# Single-column data frame indexed by rows collapses to a character vector.
input.clean <- input[(input.clean[1] + 1):((input.clean[2]) - 1), ]
# 0.2. Cut the input down to the necessary part.

# 1. Get the full unique list of variable names ("something: ...").
# QC step: every remaining row should contain a ":" separator. The original
# code put a bare string literal inside the if-braces, which does nothing when
# the script is sourced — emit a real warning instead.
if (sum(!grepl(":", input.clean)) != 0) {
  warning("Some row exists that doesn't have the character ':'", call. = FALSE)
}
# Position of the first ":" in each row; the text before it is the var name.
input.clean.end <- regexpr(":", input.clean)
input.clean.variables <- substr(input.clean, 1, input.clean.end - 1)
# Drop "id-..." entries — they are markup ids, not data variables.
input.clean.keep <- !grepl("id-", input.clean.variables)
variables.unique <- unique(input.clean.variables[input.clean.keep])
# 1. Get the full unique list of variables: "something: ...".

# 2. Prepare the table for analysis: keep only the rows whose variable name
# survived the "id-" filter above.
analyze <- input.clean[input.clean.keep]
# 2. Prepare the table for analysis.

# 3. Index every "id: ..." entry, then drop the "parent_id" entries so that
# only true record separators remain. The intermediates are kept under their
# original names because the clean-up step later rm()'s them by name.
analyze.id.indexes <- grep("id:", analyze)
analyze.id.indexes.parent_id <- grep("parent_id", analyze)
analyze.id.indexes.keep <- !(analyze.id.indexes %in% analyze.id.indexes.parent_id)
analyze.id.indexes <- analyze.id.indexes[analyze.id.indexes.keep]
# 3. Get the index of each "id: ...".

# 4. Transpose: build a matrix with one row per "id: ..." record and one
# column per unique variable, walking the flat list between consecutive ids.
# NOTE(review): assumes each record's variables appear contiguously right
# after its "id: ..." line — confirm against the Crowdin page structure.
transposed.create <- analyze.id.indexes
# Grow one "."-filled column per variable; the first column keeps the index.
for (counter in 1:length(variables.unique)) {
  transposed.create <- cbind(transposed.create, rep(".", each = length(analyze.id.indexes)))
}
for (i in 1:length(analyze.id.indexes)) {
  # Column 2 always holds the "id: ..." line itself.
  transposed.create[i, 2] <- analyze[analyze.id.indexes[i]]
  for (j in 3:(length(variables.unique) + 1)) {
    # If the value of the input cell contains "id:" (but is not a
    # "parent_id:"), we have run into the next record — blank the value out.
    if (grepl("id:", analyze[analyze.id.indexes[i] + j - 2]) & !grepl("parent_id:", analyze[analyze.id.indexes[i] + j - 2])) {
      transposed.create[i, j] <- "."
    } else {
      # If the value to the left is blank, then this should also be blank.
      if (transposed.create[i, j - 1] == ".") {
        transposed.create[i, j] <- "."
      } else {
        # If not blank, then we fill in the correctly procured data.
        transposed.create[i, j] <- analyze[analyze.id.indexes[i] + j - 2]
      }
    }
  }
}
# Clean-up of intermediate objects no longer needed downstream.
rm(input, input.url, input.url.clean, input.clean, input.clean.end, input.clean.keep, input.clean.variables,
   analyze, analyze.id.indexes, analyze.id.indexes.keep, analyze.id.indexes.parent_id, i, j, counter, url)
# 4. Transpose the values between each two "id: ..." markers into one row per record.
# Clean-up of intermediate objects.

# 5. Turn the transposed matrix into a typed data frame.
transposed.create <- data.frame(transposed.create)
# NOTE(review): these 15 names must match length(variables.unique) + 1 or the
# assignment errors; "editor_url" was deliberately left out of the list.
colnames(transposed.create) <- c("Index", "id", "name", "title", "parent_id", "type", "as_xliff", "upload_ready", "export_ready",
                                 "total_count", "translated", "approved", "translated_percent", "approved_percent", "note") # "editor_url",
# 5.1. For each value, retain only the text after the first ":". sub() with a
# "^[^:]*:" pattern is equivalent to the old substr(regexpr(...)+1, 100000)
# trick without the arbitrary 100000 upper bound, and leaves colon-free
# strings untouched.
for (i in 2:ncol(transposed.create)) {
  transposed.create[, i] <- sub("^[^:]*:", "", transposed.create[, i])
}
# 5.2. Coerce the count/percentage columns to numeric ("." becomes NA, with a
# coercion warning — expected for blanked cells).
numeric.columns <- c("Index", "id", "parent_id", "upload_ready", "export_ready",
                     "total_count", "translated", "approved",
                     "translated_percent", "approved_percent")
transposed.create[numeric.columns] <- lapply(transposed.create[numeric.columns], as.numeric)
# 5.3. Split parents from non-parents: real topic rows carry a "note" value.
transposed <- transposed.create[transposed.create[, "note"] != ".", ]
# 5. Create a data frame whose columns come from step (1):
#    5.1. For each value, retain only the string after ":".
#         IMPROVEMENT IDEA: turn the loop into sapply().
#    5.2. Set the columns' formats to numeric.
#    5.3. Split parents and non-parents.

# 6. Attach parent information via two self-joins on parent_id.
#  6.1. Parent name: join the (id, name) lookup.
#  6.2. Grandparent id: join the (id, parent_id) lookup as parent_id_next.
# NOTE(review): "transposed" already has a "name" column, so the first merge
# produces "name.x"/"name.y" suffixes; the positional renames in the output
# section depend on the exact resulting column order — verify if the column
# set ever changes.
parents <- transposed.create[, c("id", "name")]
colnames(parents)[1] <- "parent_id"
parenting <- transposed.create[, c("id", "parent_id")]
colnames(parenting) <- c("parent_id", "parent_id_next")
transposed <- merge(x = transposed, y = parents, by = "parent_id", all.x = TRUE)
transposed <- merge(x = transposed, y = parenting, by = "parent_id", all.x = TRUE)
# 6. Add parent names/ids — first from the bottom up, then from the top down.
#    6.1. Get the name belonging to parent_id.
#    6.2. Get the second-level parent_id.
#    parent_id_next: if blank, fall back to parent_id.

# 9. Post-analysis: words remaining, their order of magnitude, and the
# estimated hours of translation work at the assumed words-per-hour rate.
words.remaining <- transposed[, "total_count"] - transposed[, "translated"]
output <- cbind(transposed,
                words.remaining,
                # +0.01 keeps log10 finite when nothing remains.
                floor(log10(words.remaining + 0.01)),
                words.remaining / assume.words_per_hour)
# Name the three appended columns.
colnames(output)[(ncol(output) - 2):ncol(output)] <- c("remain", "magnitude", "hours")
# 9. Post-analysis: remaining words and size.

# 9.1. Join an explanation of the priority of each parent topic.
# Build the lookup directly instead of the old cbind-zeros-then-overwrite.
assume.priority <- data.frame(
  parent_id = c(26267, 26265, 27267),
  priority  = c("1_high_priority_platform", "2_high_priority_content", "4_low_priority"),
  stringsAsFactors = FALSE
)
output <- merge(x = output, y = assume.priority, by = "parent_id", all.x = TRUE)
# Disambiguate the duplicated "name" columns produced by the earlier merges.
# Matching by name replaces the fragile positional renames [4] and [16].
colnames(output)[colnames(output) == "name.x"] <- "name"
colnames(output)[colnames(output) == "name.y"] <- "parent_name"
# "editor_url" is never assigned as a column name upstream (it is commented
# out of the colnames list), so guard against the column being absent —
# assigning gsub(NULL) back would otherwise stop the script.
if (!is.null(output$editor_url)) {
  output$editor_url <- gsub("\\", "", output$editor_url, fixed = TRUE)
}
# 9.1. Join an explanation of the priority.

# 9.2. Save the output to file: fixed name, dated name, and a last-run stamp.
# NOTE(review): setwd() mutates global state; file.path(folder.output, ...)
# would be safer, but later code may rely on the changed working directory.
setwd(folder.output)
write.csv(output, file = assume.output_name, row.names = FALSE, na = "", quote = FALSE)
write.csv(output, file = assume.output_name.date, row.names = FALSE, na = "", quote = FALSE)
# Sys.Date is a function and must be called — the bare name would paste the
# deparsed function source into the file name and write it to the CSV.
assume.output.name.sys_date <- paste0("last_run_", Sys.Date(), ".csv")
write.csv(Sys.Date(), file = assume.output.name.sys_date, row.names = FALSE, na = "", quote = FALSE)
# 9.2. Save the output to file.


# 9.3. Upload to Google Docs.
#      9.3.1. Can use IMPORTDATA in Google Sheets:
#             https://support.google.com/docs/answer/3093340?hl=en