#
# Copyright 2017 Data Science Dojo
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# This R source code file corresponds to video 1 of the Data Science
# Dojo YouTube series "Introduction to Text Analytics with R" located
# at the following URL:
#     <YouTube Video Link Here />
#

# Install the required packages, skipping any that are already present so
# that re-running the script does not download everything again.
required.packages <- c("ggplot2", "e1071", "caret", "quanteda", "irlba",
                       "randomForest")
missing.packages <- setdiff(required.packages, rownames(installed.packages()))
if (length(missing.packages) > 0) {
  install.packages(missing.packages)
}

# Load up the .CSV data and explore in RStudio.
# Text is kept as character (not factor) so it can be measured and
# tokenized later; the file is read as UTF-16.
spam.raw <- read.csv("spam.csv", stringsAsFactors = FALSE,
                     fileEncoding = "UTF-16")
View(spam.raw)

# Clean up the data frame and view our handiwork.
# Only the first two columns are kept and given descriptive names.
spam.raw <- spam.raw[, 1:2]
names(spam.raw) <- c("Label", "Text")
View(spam.raw)

# Check data to see if there are missing values.
# Counts the rows that contain at least one NA.
sum(!complete.cases(spam.raw))

# Convert our class label into a factor.
spam.raw$Label <- as.factor(spam.raw$Label)

# The first step, as always, is to explore the data.
# First, let's take a look at distribution of the class labels
# (i.e., ham vs. spam).
prop.table(table(spam.raw$Label))

# Next up, let's get a feel for the distribution of text lengths of the SMS
# messages by adding a new feature for the length of each message.
spam.raw$TextLength <- nchar(spam.raw$Text)
summary(spam.raw$TextLength)

# Visualize distribution with ggplot2, adding segmentation for ham/spam.
library(ggplot2)

# Histogram of message lengths in 5-character bins; bars are filled by
# Label so both classes are visible within each bin.
ggplot(data = spam.raw, mapping = aes(x = TextLength, fill = Label)) +
  geom_histogram(binwidth = 5) +
  labs(
    title = "Distribution of Text Lengths with Class Labels",
    x = "Length of Text",
    y = "Text Count"
  ) +
  theme_bw()