text.preprocessing.R

# Copyright  Microsoft.  All rights reserved.
# Licensed for use under the agreement under which you purchased Azure services.

###########################################################################
# preprocessText
###########################################################################
preprocessText <- function(textVector, 
                         replaceSpecialChars,
                         removeDuplicateChars,
                         replaceNumbers,
                         convertToLowerCase,
                         removeDefaultStopWords,
                         removeGivenStopWords,
                         stemWords,
                         stopword_list = NULL)
{
  library("tm")
  if(replaceSpecialChars == TRUE) {
    print("replace special characters with space ....")
    textVector <- gsub("[^0-9a-z]", " ", textVector, ignore.case = TRUE)
  } 
  if(removeDuplicateChars == TRUE) {
    print("remove duplicate characters ....")
    textVector <- gsub('([[:alpha:]])\\1+', '\\1\\1', textVector)
  }
  if(replaceNumbers == TRUE) {
    print("replace numbers with space ....")
    textVector <- gsub("[^a-z]", " ", textVector, ignore.case = TRUE)
  }
  textVector <- gsub("\\s+", " ", textVector)
  textVector <- gsub("^\\s", "", textVector)
  textVector <- gsub("\\s$", "", textVector)
  
  print("create corpus ....")
  theCorpus <- Corpus(VectorSource(textVector))
  
  if(convertToLowerCase == TRUE) {
    print("convert to lower case ....")
    theCorpus <- tm_map(theCorpus, content_transformer(tolower))  
  }
  if(removeDefaultStopWords == TRUE){
    print("remove default stopwords ....")
    theCorpus <- tm_map(theCorpus, removeWords, stopwords("english"))
  }
  if(removeGivenStopWords == TRUE & !missing(stopword_list) & !is.null(stopword_list)) {
    print("remove given stopwords ....")
    theCorpus <- tm_map(theCorpus, removeWords, stopword_list)  
  }  
  if(stemWords == TRUE) {
    print("word stemming ....")
    theCorpus <- tm_map(theCorpus, stemDocument, "english")
  }
  print("stripWhitespace") 
  #multiple whitespace characters collapsed to a single blank
  theCorpus <- tm_map(theCorpus, stripWhitespace)
      
  textVector  <- unlist(lapply(theCorpus, 
                                function(x) return(x[1]$content)))
  
  textVector <- gsub("\\s+", " ", textVector)
  textVector <- gsub("^\\s", "", textVector)
  textVector <- gsub("\\s$", "", textVector)
  
  return(textVector)    
}

###########################################################################
# drawWordCloud
###########################################################################
drawWordCloud <- function(textVector, labelVector, maxWords=50) 
{
  library("wordcloud")
  library("tm")
  theCorpus <- Corpus(VectorSource(textVector))
  
  label.set <- unique(labelVector)
  for(i in 1:length(label.set)){
    idx <- which(labelVector == label.set[i])  
    wordcloud(theCorpus[idx], max.words=maxWords)
  }
}
###########################################################################
# create.vocabulary 
# background corpus is not necessary to be labeled, it is different from the
# labeled dat used to train the text classifier
# vocabulary creation is unsupervised task
###########################################################################
create.vocabulary <- function(text.column, minWordLen, maxWordLen, minDF, maxDF)
{  
  if(length(text.column) ==0){
    output.voc <- data.frame(row.names = c("df", "idf"))
    return(output.voc)
  }
  #check input parameters 
  if(minWordLen < 1) {
    stop("create.vocabulary error: minWordLen can't be less than < 1")  
  }
  if(maxWordLen < 1 ) {
    stop("create.vocabulary error: maxWordLen can't be less than < 1")  
  }
  if(maxWordLen < minWordLen) {
    stop("create.vocabulary error: maxWordLen can't be less than minWordLen")  
  }
  if(minDF < 1) {
    stop("create.vocabulary error: minDF can't be less than < 1")  
  }
  if(maxDF < 1) {
    stop("create.vocabulary error: maxDF can't be less than < 1")  
  }
  if(maxDF < minDF) {
    stop("create.vocabulary error: maxDF can't be less than minDF")  
  }
  
  library("tm")
  theCorpus <- Corpus(VectorSource(text.column))
  DTM <- DocumentTermMatrix(theCorpus, 
                            control = list(dictionary = NULL,                                                 
                                           weighting = weightBin,
                                           bounds = list(global = c(minDF, maxDF)),
                                           WordLengths = c(minWordLen, maxWordLen))) 
  #nDocs(DTM) 
  #nTerms(DTM)
  terms  <- Terms(DTM)
  ## S3 method for class 'TermDocumentMatrix'
  df <-  tm_term_score(DTM, terms, FUN = slam::col_sums)
  idf <- log(nDocs(DTM)/df)
  
  output.voc <- data.frame(row.names = c("df", "idf"))
  output.voc <- rbind(output.voc, df)
  output.voc <- rbind(output.voc, idf)
  names(output.voc) <- terms
  
  output.voc <- cbind(data.frame(total.docs=length(text.column)), output.voc)
    
  return(output.voc)
}
###########################################################################
# merge.vocabulary
###########################################################################
merge.vocabulary <- function(input1.voc, input2.voc)
{  
  #check input parameters 
  if(!is.data.frame(input1.voc)){
    stop("merge.vocabulary error: input1.voc must be a data frame")  
  }
  if(!is.data.frame(input2.voc)){
    stop("merge.vocabulary error: input2.voc must be a data frame")  
  }
  
  if(nrow(input1.voc) ==0){
    if(nrow(input2.voc) ==0){
      output <- data.frame()
      return(output)
    }else{
      return(nput2.voc)
    }
  }else{
    if(nrow(input2.voc) ==0){
      return(nput1.voc)
    }
  }
  
  library("tm")  
  total.docs <- input1.voc[1,"total.docs"] + input2.voc[1,"total.docs"]
  input1.voc <- subset( input1.voc, select = -c(total.docs) )
  input2.voc <- subset( input2.voc, select = -c(total.docs) )
      
  input1.dictionary <- names(input1.voc)
  input2.dictionary <- names(input2.voc)
  
  output <- NULL
  common.dictionary <- intersect(input1.dictionary, input2.dictionary)
  if(length(common.dictionary) > 0) 
  {  
    df1 <-  data.frame(input1.voc[1, common.dictionary])
    names(df1) <- common.dictionary
    
    df2 <-  data.frame(input2.voc[1, common.dictionary])
    names(df2) <- common.dictionary
    
    output <- rbind(df1, df2)
    m <- as.matrix(output)
    
    add.dfs <- m[1,] + m[2,]
    add.dfs <- t(add.dfs )
    output <- data.frame(add.dfs) 
    names(output) <- common.dictionary
  }
  
  left.extra.dictionary <- setdiff(input1.dictionary, input2.dictionary)
  if(length(left.extra.dictionary) > 0) 
  {  
    df3 <- data.frame(input1.voc[1, left.extra.dictionary])
    names(df3) <- left.extra.dictionary
    
    if(!is.null(output)){ 
      output <- cbind(output, df3)
    } else{
      output <- df3
    }      
  }
  
  right.extra.dictionary <- setdiff(input2.dictionary, input1.dictionary)
  if(length(right.extra.dictionary) > 0)
  { 
    df4 <- data.frame(input2.voc[1, right.extra.dictionary])
    names(df4) <- right.extra.dictionary
    
    if(!is.null(output)){
      output <- cbind(output, df4)
    } else{
      output <- df4
    }
  }
  output.dictionary <- sort(union(input1.dictionary, input2.dictionary))
  output <- output [, output.dictionary]
  
  output <- cbind(data.frame(total.docs=total.docs), output)
  
  return(output)
}
###########################################################################
# calculate.IDF
###########################################################################
calculate.IDF <- function(input.voc, minDF, maxDF)
{  
  #check input parameters 
  if(!is.data.frame(input.voc)){
    stop("calculate.IDF error: input.voc must be a data frame")  
  }
  if(nrow(input.voc) ==0){
    stop("calculate.IDF error: input.voc can not be empty")  
  }
  total.docs <- input.voc[1,"total.docs"] 
  
  input.voc <- subset( input.voc, select = -c(total.docs) )
  terms <- names(input.voc) 
  dfs <- as.matrix(input.voc [1,])
  kept_ids <- which(dfs >= minDF & dfs <= maxDF)
  kept_dfs <- dfs[kept_ids] 

  idfs <- log(total.docs/kept_dfs)
  
  output.voc <- data.frame(word=terms[kept_ids], df=kept_dfs, idf=idfs)
  
  return(output.voc)
}
###########################################################################
# calculate.TFIDF
###########################################################################
calculate.TFIDF <- function(text.column, input.voc, minWordLen, maxWordLen)
{
  #check input parameters 
  if(minWordLen < 1) {
    stop("calculate.TFIDF error: minWordLen can't be less than < 1")  
  }
  if(maxWordLen < 1 ) {
    stop("calculate.TFIDF error: maxWordLen can't be less than < 1")  
  }
  if(maxWordLen < minWordLen) {
    stop("calculate.TFIDF error: maxWordLen can't be less than minWordLen")  
  }
  if(!is.data.frame(input.voc)){
    stop("calculate.TFIDF error: input.voc must be a data frame")  
  }
  library("tm")  
  if(nrow(input.voc) ==0){
    stop("calculate.TFIDF error: input.voc can not be empty")  
  }
  input.dictionary <- as.vector(input.voc$word)
  
  theCorpus <- Corpus(VectorSource(text.column))
  DTM <- DocumentTermMatrix(theCorpus, 
                            control = list(dictionary = NULL,
                                           weighting = weightTf,                                                 
                                           WordLengths = c(minWordLen, maxWordLen)))  
  
  current.dictionary  <- Terms(DTM)
  common.dictionary <- intersect(input.dictionary, current.dictionary)
  DTM <- DTM[, common.dictionary]
  
  #nDocs(DTM)   
  #nTerms(DTM)
  #convert/coarse DTM into data frame
  document.term.matrix <- data.frame(doc.id = DTM$i, term.id = DTM$j, 
                                     word = common.dictionary[DTM$j], tf = DTM$v)
  
  extra.dictionary <- setdiff(input.dictionary, current.dictionary)
  
  output <- merge(x =  document.term.matrix, y = input.voc, by = "word")
  output <- output[sort.int(output$doc.id, index.return = TRUE)$ix, ]
  output <- cbind(output, tf.idf =output$tf * output$idf)
  row.names(output) <- NULL
  
  #replace TF with TF-IDF  
  DTM$v <- output$tf.idf  
  
  #convert "sparse" DocumentTermMatrix into "dense" Matrix
  denseMatrix <- as.matrix(DTM)
  
  zeroMatrix <- matrix(data =  rep(0,nrow(DTM)*length(extra.dictionary)), 
                       nrow = nrow(DTM), 
                       ncol = length(extra.dictionary), byrow = FALSE,
                       dimnames = list(Docs(DTM),
                                       extra.dictionary))
  
  denseMatrix <- cbind(denseMatrix, zeroMatrix)
  #re-order the columns in the matrix
  denseMatrix <- subset(denseMatrix, ,input.dictionary)
  
  #convert Matrix into data frame (dataset)
  df <- as.data.frame(denseMatrix)    
  
  return(df)
}

###########################################################################
# extract.TF.UsingVocabulary
###########################################################################
extract.TF.UsingVocabulary <- function(text, vocab){  
  library("tm")
  theCorpus <- Corpus(VectorSource(text))
  
  sparseDTM <- DocumentTermMatrix(theCorpus, 
                                   control = list(dictionary = vocab,
                                                  weighting = weightTf
                                                  #weighting = weightTfIdf 
                                                  #weighting = weightBin
                                   ))  
  #return(sparseDTM)
  #convert "sparse" DocumentTermMatrix into "dense" Matrix
  denseDTM <- as.matrix(sparseDTM)
  
  #convert Matrix into data frame
  df <- as.data.frame(denseDTM)  
  
  return(df)
}