Commit 15a2fbed by Arham Akheel

Migrating meetup, datasets, web_scraping_r,…

Migrating meetup, datasets, web_scraping_r, IntroDataVisualizationWithRAndGgplot2 to tutorials repository
parent 22f44079
Col1
the
and
you
for.
that
have
but
just
with
get
not
day
was
now
this
can
work
all
out
are
http
today
your
too
time
what
got
thank
back
want
from
one
know
will
see
feel
com
think
about
don
realli
had
how
some
there
night
amp
make
watch
need
new
still
they
come
home
when
look
here
off
more
much
quot
twitter
morn
last
tomorrow
then
has
been
wait
sleep
again
her
onli
week
tri
whi
tonight
would
she
thing
way
did
say
follow
veri
bit
though
take
gonna
them
over
should
yeah
bed
even
start
tweet
could
school
hour
peopl
show
twitpic
didn
guy
hey
after
him
next.
weekend
play
down
final
let
cant
use
yes
were
who
soon
never
dont
life
girl
littl
everyon
year
rain
wanna
movi
first
find
where
call
done
sure
head
our
keep
ani
than
alway
his
leav
lot
talk
alreadi
won
man
readi
someth
made
anoth
live
read
eat
becaus
yet
yay
phone
ever
hous
went
song
befor
sound
thought
mayb
summer
someon
tell
give
guess
babi
check
mean
other
end
game
into
hear
listen
later
doesn
noth
while.
actual
happen
same
pic
stuff
birthday
mom
saw
weather
car
two
doe
put
stay
yesterday
world
those
run
also
might
until
gotta
meet
said
around
post
exam
monday
friday
seem
sinc
sunday
job
must
mani
updat
myself
found
haven
video
gone
such
famili
book
most
www
aww
month
their
boy
shop
move
least
dinner
total
woke
may
anyth
lunch
studi
pictur
hair
isn
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Please determine the required text preprocessing steps using the following flag
replace_special_chars <- TRUE
remove_duplicate_chars <- TRUE
replace_numbers <- TRUE
convert_to_lower_case <- TRUE
remove_default_stopWords <- TRUE
remove_given_stopWords <- TRUE
stem_words <- TRUE
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# Map 1-based optional input ports to variables
dataset1 <- maml.mapInputPort(1) # class: data.frame
# get the label and text columns from the input data set
text_column <- dataset1[["tweet_text"]]
#label_column <- dataset1[["label_column"]]
stopword_list <- NULL
result <- tryCatch({
dataset2 <- maml.mapInputPort(2) # class: data.frame
# get the stopword list from the second input data set
stopword_list <- dataset2[[1]]
}, warning = function(war) {
# warning handler
print(paste("WARNING: ", war))
}, error = function(err) {
# error handler
print(paste("ERROR: ", err))
stopword_list <- NULL
}, finally = {})
# Load the R script from the Zip port in ./src/
source("src/text.preprocessing.R");
text_column <- preprocessText(text_column,
replace_special_chars,
remove_duplicate_chars,
replace_numbers,
convert_to_lower_case,
remove_default_stopWords,
remove_given_stopWords,
stem_words,
stopword_list)
Sentinment <- dataset1[["sentiment_label"]]
data.set <- data.frame(
Sentinment,
text_column,
stringsAsFactors = FALSE
)
# Select data.frame to be sent to the output Dataset port
maml.mapOutputPort("data.set")
\ No newline at end of file
# Copyright Microsoft. All rights reserved.
# Licensed for use under the agreement under which you purchased Azure services.
###########################################################################
# preprocessText
###########################################################################
preprocessText <- function(textVector,
replaceSpecialChars,
removeDuplicateChars,
replaceNumbers,
convertToLowerCase,
removeDefaultStopWords,
removeGivenStopWords,
stemWords,
stopword_list = NULL)
{
library("tm")
if(replaceSpecialChars == TRUE) {
print("replace special characters with space ....")
textVector <- gsub("[^0-9a-z]", " ", textVector, ignore.case = TRUE)
}
if(removeDuplicateChars == TRUE) {
print("remove duplicate characters ....")
textVector <- gsub('([[:alpha:]])\\1+', '\\1\\1', textVector)
}
if(replaceNumbers == TRUE) {
print("replace numbers with space ....")
textVector <- gsub("[^a-z]", " ", textVector, ignore.case = TRUE)
}
textVector <- gsub("\\s+", " ", textVector)
textVector <- gsub("^\\s", "", textVector)
textVector <- gsub("\\s$", "", textVector)
print("create corpus ....")
theCorpus <- Corpus(VectorSource(textVector))
if(convertToLowerCase == TRUE) {
print("convert to lower case ....")
theCorpus <- tm_map(theCorpus, content_transformer(tolower))
}
if(removeDefaultStopWords == TRUE){
print("remove default stopwords ....")
theCorpus <- tm_map(theCorpus, removeWords, stopwords("english"))
}
if(removeGivenStopWords == TRUE & !missing(stopword_list) & !is.null(stopword_list)) {
print("remove given stopwords ....")
theCorpus <- tm_map(theCorpus, removeWords, stopword_list)
}
if(stemWords == TRUE) {
print("word stemming ....")
theCorpus <- tm_map(theCorpus, stemDocument, "english")
}
print("stripWhitespace")
#multiple whitespace characters collapsed to a single blank
theCorpus <- tm_map(theCorpus, stripWhitespace)
textVector <- unlist(lapply(theCorpus,
function(x) return(x[1]$content)))
textVector <- gsub("\\s+", " ", textVector)
textVector <- gsub("^\\s", "", textVector)
textVector <- gsub("\\s$", "", textVector)
return(textVector)
}
###########################################################################
# drawWordCloud
###########################################################################
drawWordCloud <- function(textVector, labelVector, maxWords=50)
{
library("wordcloud")
library("tm")
theCorpus <- Corpus(VectorSource(textVector))
label.set <- unique(labelVector)
for(i in 1:length(label.set)){
idx <- which(labelVector == label.set[i])
wordcloud(theCorpus[idx], max.words=maxWords)
}
}
###########################################################################
# create.vocabulary
# background corpus is not necessary to be labeled, it is different from the
# labeled dat used to train the text classifier
# vocabulary creation is unsupervised task
###########################################################################
create.vocabulary <- function(text.column, minWordLen, maxWordLen, minDF, maxDF)
{
if(length(text.column) ==0){
output.voc <- data.frame(row.names = c("df", "idf"))
return(output.voc)
}
#check input parameters
if(minWordLen < 1) {
stop("create.vocabulary error: minWordLen can't be less than < 1")
}
if(maxWordLen < 1 ) {
stop("create.vocabulary error: maxWordLen can't be less than < 1")
}
if(maxWordLen < minWordLen) {
stop("create.vocabulary error: maxWordLen can't be less than minWordLen")
}
if(minDF < 1) {
stop("create.vocabulary error: minDF can't be less than < 1")
}
if(maxDF < 1) {
stop("create.vocabulary error: maxDF can't be less than < 1")
}
if(maxDF < minDF) {
stop("create.vocabulary error: maxDF can't be less than minDF")
}
library("tm")
theCorpus <- Corpus(VectorSource(text.column))
DTM <- DocumentTermMatrix(theCorpus,
control = list(dictionary = NULL,
weighting = weightBin,
bounds = list(global = c(minDF, maxDF)),
WordLengths = c(minWordLen, maxWordLen)))
#nDocs(DTM)
#nTerms(DTM)
terms <- Terms(DTM)
## S3 method for class 'TermDocumentMatrix'
df <- tm_term_score(DTM, terms, FUN = slam::col_sums)
idf <- log(nDocs(DTM)/df)
output.voc <- data.frame(row.names = c("df", "idf"))
output.voc <- rbind(output.voc, df)
output.voc <- rbind(output.voc, idf)
names(output.voc) <- terms
output.voc <- cbind(data.frame(total.docs=length(text.column)), output.voc)
return(output.voc)
}
###########################################################################
# merge.vocabulary
###########################################################################
merge.vocabulary <- function(input1.voc, input2.voc)
{
#check input parameters
if(!is.data.frame(input1.voc)){
stop("merge.vocabulary error: input1.voc must be a data frame")
}
if(!is.data.frame(input2.voc)){
stop("merge.vocabulary error: input2.voc must be a data frame")
}
if(nrow(input1.voc) ==0){
if(nrow(input2.voc) ==0){
output <- data.frame()
return(output)
}else{
return(nput2.voc)
}
}else{
if(nrow(input2.voc) ==0){
return(nput1.voc)
}
}
library("tm")
total.docs <- input1.voc[1,"total.docs"] + input2.voc[1,"total.docs"]
input1.voc <- subset( input1.voc, select = -c(total.docs) )
input2.voc <- subset( input2.voc, select = -c(total.docs) )
input1.dictionary <- names(input1.voc)
input2.dictionary <- names(input2.voc)
output <- NULL
common.dictionary <- intersect(input1.dictionary, input2.dictionary)
if(length(common.dictionary) > 0)
{
df1 <- data.frame(input1.voc[1, common.dictionary])
names(df1) <- common.dictionary
df2 <- data.frame(input2.voc[1, common.dictionary])
names(df2) <- common.dictionary
output <- rbind(df1, df2)
m <- as.matrix(output)
add.dfs <- m[1,] + m[2,]
add.dfs <- t(add.dfs )
output <- data.frame(add.dfs)
names(output) <- common.dictionary
}
left.extra.dictionary <- setdiff(input1.dictionary, input2.dictionary)
if(length(left.extra.dictionary) > 0)
{
df3 <- data.frame(input1.voc[1, left.extra.dictionary])
names(df3) <- left.extra.dictionary
if(!is.null(output)){
output <- cbind(output, df3)
} else{
output <- df3
}
}
right.extra.dictionary <- setdiff(input2.dictionary, input1.dictionary)
if(length(right.extra.dictionary) > 0)
{
df4 <- data.frame(input2.voc[1, right.extra.dictionary])
names(df4) <- right.extra.dictionary
if(!is.null(output)){
output <- cbind(output, df4)
} else{
output <- df4
}
}
output.dictionary <- sort(union(input1.dictionary, input2.dictionary))
output <- output [, output.dictionary]
output <- cbind(data.frame(total.docs=total.docs), output)
return(output)
}
###########################################################################
# calculate.IDF
###########################################################################
calculate.IDF <- function(input.voc, minDF, maxDF)
{
#check input parameters
if(!is.data.frame(input.voc)){
stop("calculate.IDF error: input.voc must be a data frame")
}
if(nrow(input.voc) ==0){
stop("calculate.IDF error: input.voc can not be empty")
}
total.docs <- input.voc[1,"total.docs"]
input.voc <- subset( input.voc, select = -c(total.docs) )
terms <- names(input.voc)
dfs <- as.matrix(input.voc [1,])
kept_ids <- which(dfs >= minDF & dfs <= maxDF)
kept_dfs <- dfs[kept_ids]
idfs <- log(total.docs/kept_dfs)
output.voc <- data.frame(word=terms[kept_ids], df=kept_dfs, idf=idfs)
return(output.voc)
}
###########################################################################
# calculate.TFIDF
###########################################################################
calculate.TFIDF <- function(text.column, input.voc, minWordLen, maxWordLen)
{
#check input parameters
if(minWordLen < 1) {
stop("calculate.TFIDF error: minWordLen can't be less than < 1")
}
if(maxWordLen < 1 ) {
stop("calculate.TFIDF error: maxWordLen can't be less than < 1")
}
if(maxWordLen < minWordLen) {
stop("calculate.TFIDF error: maxWordLen can't be less than minWordLen")
}
if(!is.data.frame(input.voc)){
stop("calculate.TFIDF error: input.voc must be a data frame")
}
library("tm")
if(nrow(input.voc) ==0){
stop("calculate.TFIDF error: input.voc can not be empty")
}
input.dictionary <- as.vector(input.voc$word)
theCorpus <- Corpus(VectorSource(text.column))
DTM <- DocumentTermMatrix(theCorpus,
control = list(dictionary = NULL,
weighting = weightTf,
WordLengths = c(minWordLen, maxWordLen)))
current.dictionary <- Terms(DTM)
common.dictionary <- intersect(input.dictionary, current.dictionary)
DTM <- DTM[, common.dictionary]
#nDocs(DTM)
#nTerms(DTM)
#convert/coarse DTM into data frame
document.term.matrix <- data.frame(doc.id = DTM$i, term.id = DTM$j,
word = common.dictionary[DTM$j], tf = DTM$v)
extra.dictionary <- setdiff(input.dictionary, current.dictionary)
output <- merge(x = document.term.matrix, y = input.voc, by = "word")
output <- output[sort.int(output$doc.id, index.return = TRUE)$ix, ]
output <- cbind(output, tf.idf =output$tf * output$idf)
row.names(output) <- NULL
#replace TF with TF-IDF
DTM$v <- output$tf.idf
#convert "sparse" DocumentTermMatrix into "dense" Matrix
denseMatrix <- as.matrix(DTM)
zeroMatrix <- matrix(data = rep(0,nrow(DTM)*length(extra.dictionary)),
nrow = nrow(DTM),
ncol = length(extra.dictionary), byrow = FALSE,
dimnames = list(Docs(DTM),
extra.dictionary))
denseMatrix <- cbind(denseMatrix, zeroMatrix)
#re-order the columns in the matrix
denseMatrix <- subset(denseMatrix, ,input.dictionary)
#convert Matrix into data frame (dataset)
df <- as.data.frame(denseMatrix)