#
# Copyright 2017 Data Science Dojo
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


#
# This R source code file corresponds to video 2 of the Data Science
# Dojo YouTube series "Introduction to Text Analytics with R" located
# at the following URL:
#     https://www.youtube.com/watch?v=Y7385dGRNLM
#


# Install all required packages. Only packages that are not yet installed
# are fetched, so re-running the script does not re-download everything.
required.packages <- c("ggplot2", "e1071", "caret", "quanteda",
                       "irlba", "randomForest")
# system.file() returns "" for a package that cannot be found.
is.installed <- vapply(required.packages,
                       function(pkg) nzchar(system.file(package = pkg)),
                       logical(1))
missing.packages <- required.packages[!is.installed]
if (length(missing.packages) > 0) {
  install.packages(missing.packages)
}


# Load up the .CSV data and explore in RStudio.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE,
                     fileEncoding = "UTF-16")
# View() opens a data viewer window, so only call it when someone is
# actually sitting at the console.
if (interactive()) {
  View(spam.raw)
}


# Clean up the data frame and view our handiwork. Only the first two
# columns are kept; they are renamed to "Label" and "Text".
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
if (interactive()) {
  View(spam.raw)
}


# Check data to see if there are missing values (counts the rows that
# contain at least one NA).
sum(!complete.cases(spam.raw))


# Convert our class label into a factor.
spam.raw$Label <- as.factor(spam.raw$Label)


# The first step, as always, is to explore the data.
# First, let's take a look at distribution of the class labels
# (i.e., ham vs. spam).
prop.table(table(spam.raw$Label))


# Next up, let's get a feel for the distribution of text lengths of the SMS
# messages by adding a new feature for the length of each message.
spam.raw$TextLength <- nchar(spam.raw$Text)
summary(spam.raw$TextLength)


# Visualize distribution with ggplot2, adding segmentation for ham/spam.
library(ggplot2)

# Histogram of message lengths (5-character bins), with the bars filled
# by class label.
ggplot(spam.raw, aes(x = TextLength, fill = Label)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Text Count", x = "Length of Text",
       title = "Distribution of Text Lengths with Class Labels")


# At a minimum we need to split our data into a training set and a
# test set. In a true project we would want to use a three-way split
# of training, validation, and test.
#
# As we know that our data has non-trivial class imbalance, we'll
# use the mighty caret package to create a random train/test split
# that ensures the correct ham/spam class label proportions (i.e.,
# we'll use caret for a random stratified split).
library(caret)
# Browsing the package help is only useful in an interactive session.
if (interactive()) {
  help(package = "caret")
}


# Use caret to create a 70%/30% stratified split. Set the random
# seed for reproducibility.
set.seed(32984)
indexes <- createDataPartition(spam.raw$Label, times = 1,
                               p = 0.7, list = FALSE)

# Rows selected by the partition form the training set; all remaining
# rows form the test set.
train <- spam.raw[indexes, ]
test <- spam.raw[-indexes, ]


# Verify proportions.
prop.table(table(train$Label))
prop.table(table(test$Label))