# CodeForPrez.R

# Set the working directory to your local project folder.
# NOTE: this path is machine-specific; edit it before running the script on
# another machine. The relative paths below are resolved against it.
setwd("C:\\Users\\user4\\Documents\\Mithun")


# Save the current workspace to an image file in the working directory
save.image("./randomforest4.Rdata")

# Install the required packages only when they are not already available,
# so that re-running the script does not re-download them every time.
for (pkg in c("randomForest", "caret", "rpart")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# clear all the variables
#rm(list=ls())

# Read the training and test sets from the "data" sub-folder
train <- read.csv("./data//train.csv")
test <- read.csv("./data//test.csv")
head(test)
# Add a placeholder "Survived" column to test so it can be combined with train
test$Survived <- NA

# Combine train and test so the engineered features are built on all rows
combi <- rbind(train, test)

# Convert names to character
combi$Name <- as.character(combi$Name)

# Split 'Name' to isolate a person's title using strsplit
strsplit(combi$Name[1], split = '[,.]')

# test of how strsplit works: the second piece of the split is the title
strsplit(combi$Name[1], split = '[,.]')[[1]]
strsplit(combi$Name[1], split = '[,.]')[[1]][2]

# Apply the same split to every row to isolate the title.
# vapply() guarantees a character result with one value per name.
combi$Title <- vapply(
  combi$Name,
  function(x) strsplit(x, split = '[,.]')[[1]][2],
  character(1)
)

# Remove all spaces from the 'Title' field.
# Note that this also removes the space inside multi-word titles.
combi$Title <- gsub(' ', '', combi$Title)

# Review contents of 'Title' field
table(combi$Title)

# Reduce 'Title' contents into fewer categories.
# Titles are compared WITHOUT spaces because of the gsub() above, so the
# countess title must be spelled 'theCountess' here to match.
combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combi$Title[combi$Title %in% c('Dona', 'Lady', 'theCountess', 'Jonkheer')] <- 'Lady'

# Change Title to a factor
combi$Title <- factor(combi$Title)

# Combine sibling and parent/child variables into FamilySize variable
# (the + 1 counts the passenger themselves)
combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Identify families by combining family size and last name
# # the surname is the first piece of the split name
combi$Surname <- vapply(
  combi$Name,
  function(x) strsplit(x, split = '[,.]')[[1]][1],
  character(1)
)
# # combine with family size
combi$FamilyID <- paste0(as.character(combi$FamilySize), combi$Surname)
# Categorize families of size 2 or fewer as small
combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
# Review results
table(combi$FamilyID)
# Further consolidate results (some families may have different last names):
# any FamilyID that occurs on 2 or fewer rows is also relabelled 'Small'
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2, ]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- 'Small'
combi$FamilyID <- factor(combi$FamilyID)

# Split the combined dataset back into train and test datasets
train <- combi[1:891, ]
test <- combi[892:1309, ]

# PRESENTATION START

# The "Age" variable has a few missing values
# To use the randomForest package in R, there should be no missing values
# A quick way of dealing with missing values is to replace them with either
# the mean or the median of the non-missing values for the variable
# In this example, we replace the missing values with a prediction from a
# regression tree fitted on the rows where Age is known.
# Note: 'Fare' and 'Embarked' are only cleaned further below, i.e. after
# this Age model has been fitted.
library(rpart)
Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!is.na(combi$Age), ],
  method = "anova"
)
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age), ])

# Check for other missing variables
## 'Embarked' has blank values; locate them with "which"
blank_embarked <- which(combi$Embarked == '')
blank_embarked
## The blank rows are replaced with 'S', which the original author identified
## as the mode of 'Embarked'. The located rows are used instead of
## hard-coded row numbers so the fix follows the data.
combi$Embarked[blank_embarked] <- "S"

## convert 'Embarked' to a factor
combi$Embarked <- factor(combi$Embarked)


## 'Fare' has a missing (NA) value; locate it
missing_fare <- which(is.na(combi$Fare))
missing_fare
# Replace with the median of the known fares
combi$Fare[missing_fare] <- median(combi$Fare, na.rm = TRUE)

## All missing values are taken care of now

# The randomForest package limits how many levels a factor predictor may have
# (32 according to the original author; the exact limit may depend on the
# installed package version - verify against its documentation).
# If a factor exceeds the limit, its levels need to be reduced or the variable
# needs to be converted into a continuous one.
# This example will redefine the levels
str(combi$FamilyID)


## Increase the definition of Small from 2 to 3.
## Work on a character copy so that new labels can be assigned freely,
## then convert back to a factor to drop the unused levels.
combi$FamilyID2 <- as.character(combi$FamilyID)
combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
combi$FamilyID2 <- factor(combi$FamilyID2)

# Split the dataset into training and test again so both pick up the
# imputed values and the new FamilyID2 column
train <- combi[1:891, ]
test <- combi[892:1309, ]


# Load the random forest implementation (install it first if needed)
#install.packages('randomForest')
library(randomForest)


# Seed the random number generator so that repeated runs of the script
# produce the same forest; the particular value is arbitrary.

set.seed(415)
fit <- randomForest(
  as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare +
    Embarked + Title + FamilySize + FamilyID2,
  data = train,
  importance = TRUE,
  ntree = 2000
)
## importance = TRUE keeps the information needed to inspect variable importance
## ntree sets the number of trees to grow

### Also look at nodesize and sampsize to simplify the trees and so
### reduce complexity


# Plot which variables matter most
varImpPlot(fit)
## How to read the two panels:

### MeanDecreaseAccuracy: how much accuracy drops without the variable shown
### on the Y-axis.
### Author's note: 'Title' causes the largest drop and is therefore the most
### predictive.

### MeanDecreaseGini: a measure of how pure the terminal nodes are.
### The plot again shows, per variable, the decrease in Gini value when the
### variable is removed.
### The variable with the highest value has the highest predictive power.

### Author's note: "Title" ranks first on both measures.


# Performance Evaluation

## Confusion Matrix
## The 'fit' object is made up of several components; list their names

names(fit)

## Show the confusion matrix, first by position and then by name

fit[5]
fit$confusion

## Confidence interval for accuracy
set.seed(121)
library(caret)
# confusionMatrix() needs both inputs to be factors with the same levels.
# fit$predicted is a factor (the forest was fitted on as.factor(Survived)),
# whereas train$Survived is never converted to a factor in this script, so
# convert it here using the levels of the predictions.
confusionMatrix(
  fit$predicted,
  factor(train$Survived, levels = levels(fit$predicted))
)

## Area under the curve
###roc(train$Survived,as.integer(fit$predicted),plot = TRUE,smooth=TRUE)


# Tuning the Model


# Build a reduced training frame holding only the response and the predictors
# used by the model, rather than every column of the original training data
model_columns <- c(
  "Pclass", "Survived", "Sex", "Age", "SibSp", "Parch",
  "Fare", "Embarked", "Title", "FamilySize", "FamilyID2"
)
train1 <- train[, model_columns]


# Search over candidate mtry values and keep the one with the best accuracy
set.seed(121)
tunefit <- train(
  as.factor(Survived) ~ .,
  data = train1,
  method = "rf",
  metric = "Accuracy",
  tuneGrid = data.frame(mtry = c(2, 3, 4))
)
tunefit


# Prediction: score the test set with the tuned model
prediction <- predict(tunefit, newdata = test)
head(prediction)

# Save the workspace, including the fitted models, to the image file
save.image("./randomforest4.Rdata")