Unverified Commit 08f5fb8e by Arham Akheel Committed by GitHub

Files for webinar/tutorial

parent ca999c34
Copyright 2017 Data Science Dojo
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# This R source code file corresponds to the Data Science Dojo webinar
# titled "An Introduction to Data Visualization with R and ggplot2"
#
setwd("C:/Users/Arham/Desktop/Data Visualization with ggplot2")
install.packages("ggplot2")
install.packages("dplyr")
library(dplyr)
library(ggplot2)
# Load Titanic data for analysis. Open in spreadsheet view.
titanic <- read.csv("titanic.csv", stringsAsFactors = FALSE)
View(titanic)
# Set up factors.
titanic$Pclass <- as.factor(titanic$Pclass)
titanic$Survived <- as.factor(titanic$Survived)
titanic$Sex <- as.factor(titanic$Sex)
titanic$Embarked <- as.factor(titanic$Embarked)
#
# We'll start our visual analysis of the data focusing on questions
# related to survival rates. Specifically, these questions will use
# the factor (i.e., categorical) variables in the data. Factor data
# is very common in the business context and ggplot2 offers many
# powerful features for visualizing factor data.
#
#
# First question - What was the survival rate?
#
# As Survived is a factor (i.e., categorical) variable, a bar chart
# is a great visualization to use.
#
ggplot(titanic, aes(x = Survived)) +
geom_bar()
# If you really want percentages.
prop.table(table(titanic$Survived))
# Add some customization for labels and theme.
ggplot(titanic, aes(x = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
title = "Titanic Survival Rates")
#
# Second question - What was the survival rate by gender?
#
# We can use color to look at two aspects (i.e., dimensions)
# of the data simultaneously.
#
ggplot(titanic, aes(x = Sex, fill = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
title = "Titanic Survival Rates by Sex")
#
# Third question - What was the survival rate by class of ticket?
#
ggplot(titanic, aes(x = Pclass, fill = Survived)) +
theme_bw() +
geom_bar() +
labs(y = "Passenger Count",
title = "Titanic Survival Rates by Pclass")
#
# Fourth question - What was the survival rate by class of ticket
# and gender?
#
# We can leverage facets to further segment the data and enable
# "visual drill-down" into the data.
#
ggplot(titanic, aes(x = Sex, fill = Survived)) +
theme_bw() +
facet_wrap(~ Pclass) +
geom_bar() +
labs(y = "Passenger Count",
title = "Titanic Survival Rates by Pclass and Sex")
#
# Next, we'll move on to visualizing continuous (i.e., numeric)
# data using ggplot2. We'll explore visualizations of single
# numeric variables (i.e., columns) and also illustrate how
# ggplot2 enables visual drill-down on numeric data.
#
#
# Fifth Question - What is the distribution of passenger ages?
#
# The histogram is a staple of visualizing numeric data as it very
# powerfully communicates the distrubtion of a variable (i.e., column).
#
ggplot(titanic, aes(x = Age)) +
theme_bw() +
geom_histogram(binwidth = 5) +
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
title = "Titanic Age Distribtion")
#
# Sixth Question - What are the survival rates by age?
#
ggplot(titanic, aes(x = Age, fill = Survived)) +
theme_bw() +
geom_histogram(binwidth = 5) +
labs(y = "Passenger Count",
x = "Age (binwidth = 5)",
title = "Titanic Survival Rates by Age")
# Another great visualization for this question is the box-and-whisker
# plot.
ggplot(titanic, aes(x = Survived, y = Age)) +
theme_bw() +
geom_boxplot() +
labs(y = "Age",
x = "Survived",
title = "Titanic Survival Rates by Age")
#
# Seventh Question - What is the survival rates by age when segmented
# by gender and class of ticket?
#
# A related visualization to the histogram is a density plot. Think of
# a density plot as a smoothed version of the histogram. Using ggplot2
# we can use facets to allow for visual drill-down via density plots.
#
ggplot(titanic, aes(x = Age, fill = Survived)) +
theme_bw() +
facet_wrap(Sex ~ Pclass) +
geom_density(alpha = 0.5) +
labs(y = "Age",
x = "Survived",
title = "Titanic Survival Rates by Age, Pclass and Sex")
# If you prefer histograms, no problem!
ggplot(titanic, aes(x = Age, fill = Survived)) +
theme_bw() +
facet_wrap(Sex ~ Pclass) +
geom_histogram(binwidth = 5) +
labs(y = "Age",
x = "Survived",
title = "Titanic Survival Rates by Age, Pclass and Sex")
# Load H1B data for analysis. Open in spreadsheet view.
h1b <- read.csv("H-1B_FY2018.csv", stringsAsFactors = FALSE, encoding = 'UTF-8')
View(h1b)
# Set up factors.
h1b$EMPLOYER_NAME <- as.factor(h1b$EMPLOYER_NAME)
h1b$EMPLOYER_CITY <- as.factor(h1b$EMPLOYER_CITY)
h1b$EMPLOYER_STATE <- as.factor(h1b$EMPLOYER_STATE)
h1b$SOC_NAME <- as.factor(h1b$SOC_NAME)
h1b$WORKSITE_CITY <- as.factor(h1b$WORKSITE_CITY)
h1b$WORKSITE_STATE <- as.factor(h1b$WORKSITE_STATE)
h1b$CASE_STATUS <- as.factor(h1b$CASE_STATUS)
h1b$PW_WAGE_LEVEL<- as.factor(h1b$PW_WAGE_LEVEL)
h1b$JOB_TITLE<- as.factor(h1b$JOB_TITLE)
h1b$PREVAILING_WAGE <- as.numeric(h1b$PREVAILING_WAGE)
h1b$PREVAILING_WAGE[is.na(h1b$PREVAILING_WAGE)] <- round(mean(h1b$PREVAILING_WAGE, na.rm = TRUE))
head(h1b)
#We can use color to look at two aspects (i.e., dimensions)
# of the data simentiously
ggplot(h1b, aes(x = EMPLOYER_STATE, fill = CASE_STATUS)) +
theme_bw() +
geom_bar() +
labs(y = "No. of Applications", x = "Employer State",
title = "Distribution by Employer State")
# Subsetting the data to keep only "CERTIFIED" H1B cases
certified_h1b <- h1b %>%
filter(CASE_STATUS == "CERTIFIED")
#Function to return the top N employers that have the most H1B workers
top_N_employers <- function(num_emp) {
certified_h1b %>%
group_by(EMPLOYER_NAME) %>%
summarise(num_apps = n()) %>%
arrange(desc(num_apps)) %>%
slice(1:num_emp)
}
# Bar plot to show the top 10 employers who filed the most h1b visa applications
ggplot(top_N_employers(10),
aes(x = reorder(EMPLOYER_NAME, num_apps), y = num_apps)) +
geom_bar(stat = "identity", alpha = 0.9, fill = "green", width = 0.7) +
coord_flip() +
scale_y_continuous(limits = c(0, 11000), breaks = seq(0, 11000)) +
geom_text(aes(label = num_apps), hjust = -0.2, size = 2) +
ggtitle("Top 10 Employers with most applications") +
theme_bw() +
labs(x = "Employer Name", y = "No. of Applications")
# Function to return top N occupations that have the most H1B applicants
top_N_SOC <- function(num) {
certified_h1b %>%
filter(!is.na(certified_h1b$SOC_NAME)) %>%
group_by(SOC_NAME) %>%
summarise(num_apps = n()) %>%
arrange(desc(num_apps)) %>%
slice(1:num)
}
# Bar plot to show the top 10 H1B occupations
ggplot(top_N_SOC(10),
aes(x = reorder(SOC_NAME, num_apps), y = num_apps)) +
geom_bar(stat = "identity", alpha = 0.9, fill = "blue", width = 0.7) +
coord_flip() +
scale_y_continuous() +
geom_text(aes(label = num_apps), hjust = -0.2, size = 2) +
ggtitle("Top 10 occupations with most H1B petitions") +
theme(plot.title = element_text(size = rel(1)),
axis.text.y = element_text(size = rel(0.8))) +
labs(x = "SOC Name", y = "No. of Applications")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment