CodeForPrez.R 6.25 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
#Set working directory

setwd("C:\\Users\\user4\\Documents\\Mithun")



# Create new image and Save it

save.image("./randomforest4.Rdata")

# Install packages( if necessary, install using 'Packages' tab)
install.packages("randomForest")
install.packages("caret")
install.packages("rpart")

# clear all the variables
#rm(list=ls())

train=read.csv("./data//train.csv")
test=read.csv("./data//test.csv")
head(test)
# Add "Survived" column to test, to help combine with train data
test$Survived=NA

# Combine train and test
combi=rbind(train,test)

# Convert names to character
combi$Name<-as.character(combi$Name)

# Split 'Name' to isolate a person's title using strsplit
strsplit(combi$Name[1],split='[,.]')

# test of how strsplit works
strsplit(combi$Name[1],split='[,.]')[[1]]
strsplit(combi$Name[1],split='[,.]')[[1]][2]

# apply function to dataset
# This will isolate title for all rows

combi$Title <- sapply(combi$Name, FUN=function(x){strsplit(x,split='[,.]')[[1]][2]})

# remove empty spaces from the 'Title' field
combi$Title=gsub(' ','',combi$Title)

# Review contents of 'Title' field
table(combi$Title)

# Reduce 'Title' contenst into fewer categories
combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combi$Title[combi$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'

# Change Title to a factor
combi$Title <- factor(combi$Title)

# Combine sibling and parent/child variables into FamilySize variable
combi$FamilySize <- combi$SibSp + combi$Parch + 1

# Identifying families by combining last name and family size
# # identify surname
combi$Surname <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][1]})
# # combine with family size
combi$FamilyID <- paste(as.character(combi$FamilySize), combi$Surname, sep="")
# Categorize family size less than 2 as small
combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
# Review results
table(combi$FamilyID)
# Further consolidate results( some families may have different last names)
famIDs <- data.frame(table(combi$FamilyID))
famIDs <- famIDs[famIDs$Freq <= 2,]
combi$FamilyID[combi$FamilyID %in% famIDs$Var1] <- 'Small'
combi$FamilyID <- factor(combi$FamilyID)

# Splitting this new dataset back into train and test datasets
train <- combi[1:891,]
test <- combi[892:1309,]

# PRESENTATION START
# PRESENTATION START

# The "Age" variable has a few missing values
# To use the randomForest package in R, there should be no missing values
# A quick way of dealing with missing values is to replace them with either the mean or the median of the non-missing values for the variable
# In this example, we are replacing the missing values with a prediction, using decision trees.
library(rpart)
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,data=combi[!is.na(combi$Age),], method="anova")
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age),])

# Check for other missing variables
## 'Embarked' has two blank variables
## They are identified using the "which" command
which(combi$Embarked == '')
## rows 62 and 830 have the blank values for Embarked
## they are replaced with the mode of all the values for Embarked, which is 'S'
combi$Embarked[c(62,830)] = "S"

## convert 'Embarked' to a factor
combi$Embarked <- factor(combi$Embarked)


##'Fare' has one NA value
which(is.na(combi$Fare))
#Replace with median
combi$Fare[1044] <- median(combi$Fare, na.rm=TRUE)

## All missing values are taken care of now

# Random Forests in R can only digest factors with up to 32 levels
# If any factor variable has more than 32 levels, the levels need to be redefined to be <= 32 or the variable needs to be converted into a continuous one
# This example will redefine the levels
str(combi$FamilyID)


##increase the definition of Small from 2 to 3
combi$FamilyID2 <- combi$FamilyID
combi$FamilyID2 <- as.character(combi$FamilyID2)
combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
combi$FamilyID2 <- factor(combi$FamilyID2)

# split the dataset into training and test
train <- combi[1:891,]
test <- combi[892:1309,]


# installing the package
#install.packages('randomForest')
library(randomForest)


#  to ensure reproducible results, use the set.seed function
# this will give you the same results everytime you run the code
# the number inside is not important

set.seed(415)
fit <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title + FamilySize +FamilyID2, data=train, importance=TRUE, ntree=2000)
## 'importance=TRUE' allows us to inspect variable importance
##  ntree enables specifying how many trees we want to grow

### ALSO LOOK AT NODESIZE AND SAMPSIZE TO SIMPPLIFY TREE, IN ORDER TO REDUCE COMPLEXITY


# Look at what variables are important
varImpPlot(fit)
## define accuracy and gini

### MeanDecreaseAccuracy : Tells us how much the accuracy decreases without the variable on the Y-axis.
### 'Title' causes the most decrease and is therefore the most predictive in nature

### MeanDecreseGini: Measure how pure terminal nodes are. 
### Again the plot tests results after removing each variable, for decrease in Gini value.
### Variable with the highest value has the highest predictive power

### "Title" variable is top for both measures



# Performance Evaluation

## Confusion Matrix
## The 'fit' object contains several components

names(fit)

## to review the confusion matrix

fit[5]
fit$confusion

##  Confidence Interval forAccuracy
set.seed(121)
library(caret)
confusionMatrix(fit$predicted,train$Survived)

## Area under the curve
###roc(train$Survived,as.integer(fit$predicted),plot = TRUE,smooth=TRUE)


# Tuning the Model




# creating a new data.frame to contain just the predictors necessary and not all the columns
# in the original training dataset
train1=data.frame(Pclass = train$Pclass,Survived =train$Survived, Sex=train$Sex,Age=train$Age,SibSp=train$SibSp,Parch=train$Parch,
                  Fare =train$Fare, Embarked=train$Embarked,Title=train$Title,FamilySize=train$FamilySize,FamilyID2=train$FamilyID2)


# tune to get best value of mtry
set.seed(121)
tunefit=train(as.factor(Survived)~ ., data=train1,method="rf",metric="Accuracy",tuneGrid=data.frame(mtry=c(2,3,4)))
tunefit


# Prediction
prediction=predict(tunefit, newdata=test)
head(prediction)

save.image("./randomforest4.Rdata")