diff --git a/Introduction to Data Visualization with R and ggplot2/IntroDataVizRAndGgplot2.R b/Introduction to Data Visualization with R and ggplot2/IntroDataVizRAndGgplot2.R deleted file mode 100644 index 4962a15..0000000 --- a/Introduction to Data Visualization with R and ggplot2/IntroDataVizRAndGgplot2.R +++ /dev/null @@ -1,174 +0,0 @@ -# -# Copyright 2017 Data Science Dojo -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -# -# This R source code file corresponds to the Data Science Dojo webinar -# titled "An Introduction to Data Visualization with R and ggplot2" -# - -#install.packages("ggplot2") -library(ggplot2) - - -# Load Titanic titanicing data for analysis. Open in spreadsheet view. -titanic <- read.csv("titanic.csv", stringsAsFactors = FALSE) -View(titanic) - - -# Set up factors. -titanic$Pclass <- as.factor(titanic$Pclass) -titanic$Survived <- as.factor(titanic$Survived) -titanic$Sex <- as.factor(titanic$Sex) -titanic$Embarked <- as.factor(titanic$Embarked) - - -# -# We'll start our visual analysis of the data focusing on questions -# related to survival rates. Specifically, these questions will use -# the factor (i.e., categorical) variables in the data. Factor data -# is very common in the business context and ggplot2 offers many -# powerful features for visualizing factor data. -# - - -# -# First question - What was the survival rate? -# -# As Survived is a factor (i.e., categorical) variable, a bar chart -# is a great visualization to use. -# -ggplot(titanic, aes(x = Survived)) + - geom_bar() - -# If you really want percentages. -prop.table(table(titanic$Survived)) - -# Add some customization for labels and theme. -ggplot(titanic, aes(x = Survived)) + - theme_bw() + - geom_bar() + - labs(y = "Passenger Count", - title = "Titanic Survival Rates") - - -# -# Second question - What was the survival rate by gender? -# -# We can use color to look at two aspects (i.e., dimensions) -# of the data simultaneously. -# -ggplot(titanic, aes(x = Sex, fill = Survived)) + - theme_bw() + - geom_bar() + - labs(y = "Passenger Count", - title = "Titanic Survival Rates by Sex") - - -# -# Third question - What was the survival rate by class of ticket? -# -ggplot(titanic, aes(x = Pclass, fill = Survived)) + - theme_bw() + - geom_bar() + - labs(y = "Passenger Count", - title = "Titanic Survival Rates by Pclass") - - -# -# Fourth question - What was the survival rate by class of ticket -# and gender? -# -# We can leverage facets to further segment the data and enable -# "visual drill-down" into the data. -# -ggplot(titanic, aes(x = Sex, fill = Survived)) + - theme_bw() + - facet_wrap(~ Pclass) + - geom_bar() + - labs(y = "Passenger Count", - title = "Titanic Survival Rates by Pclass and Sex") - - - - -# -# Next, we'll move on to visualizing continuous (i.e., numeric) -# data using ggplot2. We'll explore visualizations of single -# numeric variables (i.e., columns) and also illustrate how -# ggplot2 enables visual drill-down on numeric data. -# - - -# -# Fifth Question - What is the distribution of passenger ages? -# -# The histogram is a staple of visualizing numeric data as it very -# powerfully communicates the distrubtion of a variable (i.e., column). -# -ggplot(titanic, aes(x = Age)) + - theme_bw() + - geom_histogram(binwidth = 5) + - labs(y = "Passenger Count", - x = "Age (binwidth = 5)", - title = "Titanic Age Distribtion") - - -# -# Sixth Question - What are the survival rates by age? -# -ggplot(titanic, aes(x = Age, fill = Survived)) + - theme_bw() + - geom_histogram(binwidth = 5) + - labs(y = "Passenger Count", - x = "Age (binwidth = 5)", - title = "Titanic Survival Rates by Age") - -# Another great visualization for this question is the box-and-whisker -# plot. -ggplot(titanic, aes(x = Survived, y = Age)) + - theme_bw() + - geom_boxplot() + - labs(y = "Age", - x = "Survived", - title = "Titanic Survival Rates by Age") - - -# -# Seventh Question - What is the survival rates by age when segmented -# by gender and class of ticket? -# -# A related visualization to the histogram is a density plot. Think of -# a density plot as a smoothed version of the histogram. Using ggplot2 -# we can use facets to allow for visual drill-down via density plots. -# -ggplot(titanic, aes(x = Age, fill = Survived)) + - theme_bw() + - facet_wrap(Sex ~ Pclass) + - geom_density(alpha = 0.5) + - labs(y = "Age", - x = "Survived", - title = "Titanic Survival Rates by Age, Pclass and Sex") - -# If you prefer histograms, no problem! -ggplot(titanic, aes(x = Age, fill = Survived)) + - theme_bw() + - facet_wrap(Sex ~ Pclass) + - geom_histogram(binwidth = 5) + - labs(y = "Age", - x = "Survived", - title = "Titanic Survival Rates by Age, Pclass and Sex") -