titanic.R 2.42 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
#=========================================================================================
#
# File:        titanic.R
# Author:      Dave Langer
# Description: This code illustrates R coding used in the "Introduction to R Programming 
#              for Excel Users" Meetup dated 05/03/2017. More details on 
#              the Meetup are available at:
#
#                 https://www.meetup.com/data-science-dojo/events/239049571/
#
#              The code in this file leverages data from Kaggle's "Titanic: Machine 
#              Learning from Disaster" introductory competition:
#
#                 https://www.kaggle.com/c/titanic
#
# NOTE - This file is provided "As-Is" and no warranty regarding its contents is
#        offered or implied. USE AT YOUR OWN RISK!
#
#=========================================================================================


# Read "titanic.csv" (a relative path, so it is resolved against the current
# working directory) into an R data frame - R's counterpart to an Excel table.
# header = TRUE tells read.csv() that the first row holds the column names.
titanic <- read.csv(file = "titanic.csv", header = TRUE)


# Derive a SurvivedLabel column from Survived: rows where Survived equals 1
# are labelled "Survived", every other non-missing value is labelled "Died"
# (a missing Survived value yields a missing label).
titanic[["SurvivedLabel"]] <- ifelse(
  test = titanic[["Survived"]] == 1,
  yes = "Survived",
  no = "Died"
)


# Derive a FamilySize column (i.e., a new feature): 1 plus the row's SibSp
# and Parch values. Then open the updated data frame in the data viewer.
titanic[["FamilySize"]] <- 1 + titanic[["SibSp"]] + titanic[["Parch"]]
View(titanic)


# Print the structure of the data frame: str() lists every column together
# with its data type (i.e., R's version of Excel data formatting for cells).
str(object = titanic)


# Row filter: keep only the rows of the Titanic data frame whose Sex column
# equals "male" (all columns are retained).
males <- titanic[titanic[["Sex"]] == "male", ]


# Descriptive statistics for the fares of the male passengers.
summary(males[["Fare"]])  # min, quartiles, median, mean, max
var(males[["Fare"]])      # sample variance
sd(males[["Fare"]])       # sample standard deviation
sum(males[["Fare"]])      # total of all fares
length(males[["Fare"]])   # number of fare values (one per row of males)


# Index ranges behave much like Excel ranges - take rows 1 through 5 of the
# data frame, keeping every column.
first.five <- titanic[seq_len(5), ]


# Show columns 1 through 5 of those five rows in the data viewer.
View(first.five[, 1:5])


# Use an R package (i.e., the Excel equivalent of an Add-in) to
# make creating powerful visualizations easy.
#install.packages("ggplot2")
library(ggplot2)

# Histogram of FamilySize (one bin per whole number), bars filled by
# SurvivedLabel, with one panel per Sex / Pclass combination.
ggplot(data = titanic,
       mapping = aes(x = FamilySize, fill = SurvivedLabel)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(facets = Sex ~ Pclass) +
  theme_bw()


# Use an R package (i.e., the Excel equivalent of an Add-in) to
# make building data pivots easy.
#install.packages("dplyr")
library(dplyr)

# Build a pivot with one row per Pclass / Sex / SurvivedLabel combination:
#   AvgFamilySize  - mean of FamilySize within the combination
#   PassengerCount - number of rows within the combination
# summarize() only removes the last grouping variable, so without ungroup()
# the result would silently stay grouped by Pclass and Sex and any later
# dplyr verb applied to `pivot` would operate per group. ungroup() returns a
# plain table; the row order produced by arrange() is unaffected.
pivot <- titanic %>%
  group_by(Pclass, Sex, SurvivedLabel) %>%
  summarize(AvgFamilySize = mean(FamilySize),
            PassengerCount = n()) %>%
  ungroup() %>%
  arrange(Pclass, Sex, SurvivedLabel)
View(pivot)