Text Processing Script.R 1.84 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
# ------------------------------------------------------------------- #
# Choose which text preprocessing steps are required by setting the
# flags below. Every flag is passed, in this order, to preprocessText()
# later in the script.
replace_special_chars    <- TRUE
remove_duplicate_chars   <- TRUE
replace_numbers          <- TRUE
convert_to_lower_case    <- TRUE
remove_default_stopWords <- TRUE
remove_given_stopWords   <- TRUE
stem_words               <- TRUE
# ------------------------------------------------------------------- #

# Map 1-based optional input ports to variables
dataset1 <- maml.mapInputPort(1) # class: data.frame

# Get the text column from the input data set. `[[` returns NULL for a
# column that does not exist, so check up front and fail with a clear
# message instead of handing NULL to the preprocessing step.
if (!("tweet_text" %in% names(dataset1))) {
  stop("Input dataset 1 has no 'tweet_text' column.", call. = FALSE)
}
text_column <- dataset1[["tweet_text"]]
# The label column ("sentiment_label") is read later in the script, when
# the output data frame is assembled.

# Read the user-supplied stopword list from input port 2.
# The result is the first column of that data set, or NULL when the port
# cannot be read (any error is logged and turned into NULL).
stopword_list <- tryCatch(
  withCallingHandlers(
    {
      dataset2 <- maml.mapInputPort(2) # class: data.frame
      # get the stopword list from the second input data set
      dataset2[[1]]
    },
    warning = function(war) {
      # Log the warning and carry on: a calling handler plus the
      # "muffleWarning" restart resumes evaluation, so a warning no
      # longer throws away a stopword list that was read successfully.
      print(paste("WARNING: ", war))
      invokeRestart("muffleWarning")
    }
  ),
  error = function(err) {
    # The handler's return value becomes the value of tryCatch(), which
    # is what actually resets stopword_list; assigning inside the
    # handler would only create a local variable.
    print(paste("ERROR: ", err))
    NULL
  }
)
 
# Load the helper script shipped through the Zip port (unpacked in ./src/).
# preprocessText() is not defined in this file; presumably it comes from
# this script -- confirm against src/text.preprocessing.R.
source("src/text.preprocessing.R")

# Run the preprocessing and overwrite text_column with its result.
# Arguments are positional: the text, then the seven step flags in the
# order they are declared at the top of this script, then the stopword
# list (which is NULL when input port 2 could not be read).
text_column <- preprocessText(
  text_column,
  replace_special_chars,
  remove_duplicate_chars,
  replace_numbers,
  convert_to_lower_case,
  remove_default_stopWords,
  remove_given_stopWords,
  stem_words,
  stopword_list
)
# Labels come from the "sentiment_label" column of the first input.
# NOTE(review): the variable name is misspelled, but data.frame() derives
# the output column name from it, so it is kept as-is to avoid changing
# the output schema.
Sentinment <- dataset1[["sentiment_label"]]

# Combine labels and preprocessed text; strings stay character vectors.
data.set <- data.frame(Sentinment, text_column, stringsAsFactors = FALSE)

# Hand the data frame (referenced by name) to the output Dataset port.
maml.mapOutputPort("data.set")