# Copyright 2017 Data Science Dojo
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Provenance: this script was extracted from a git patch (commit
# 08f5fb8e29e602955efa8b429f4299f6426cf1c9, "Files for webinar/tutorial",
# Arham Akheel <31144153+arhamakheel@users.noreply.github.com>,
# Tue, 19 Jun 2018 15:57:32 -0700) that created
# "Introduction to Data Visualization with R and ggplot2/
#  Data Visualization with ggplot2.R". The same commit added three binary
# companion files in that folder, which cannot be reproduced here:
#   - Data Visualization with ggplot2.pdf (437161 bytes)
#   - H1B Metadata 2018.pdf (179467 bytes)
#   - Vislualizations - A thought Starter.jpg (497862 bytes)
#

#
# This R source code file corresponds to the Data Science Dojo webinar
# titled "An Introduction to Data Visualization with R and ggplot2"
#


# Setup ----

# The webinar data lived in this folder on the presenter's machine. Switch to
# it only when it exists; otherwise the CSV files are read from the current
# working directory instead of the script aborting on setwd().
data_dir <- "C:/Users/Arham/Desktop/Data Visualization with ggplot2"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}

# Install the required packages only when they are missing, so re-running the
# script does not reinstall them every time.
for (pkg in c("ggplot2", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(dplyr)
library(ggplot2)


# Titanic data ----

# Load Titanic data for analysis. Open in spreadsheet view (the viewer is
# only available in an interactive session).
titanic <- read.csv("titanic.csv", stringsAsFactors = FALSE)
if (interactive()) {
  View(titanic)
}

# Set up factors.
titanic_factor_cols <- c("Pclass", "Survived", "Sex", "Embarked")
titanic[titanic_factor_cols] <- lapply(titanic[titanic_factor_cols], as.factor)


#
# We'll start our visual analysis of the data focusing on questions
# related to survival rates. Specifically, these questions will use
# the factor (i.e., categorical) variables in the data. Factor data
# is very common in the business context and ggplot2 offers many
# powerful features for visualizing factor data.
#


#
# First question - What was the survival rate?
#
# As Survived is a factor (i.e., categorical) variable, a bar chart
# is a great visualization to use.
#
ggplot(titanic, aes(x = Survived)) +
  geom_bar()

# If you really want percentages.
prop.table(table(titanic$Survived))

# Add some customization for labels and theme.
ggplot(titanic, aes(x = Survived)) +
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates")


#
# Second question - What was the survival rate by gender?
#
# We can use color to look at two aspects (i.e., dimensions)
# of the data simultaneously.
#
ggplot(titanic, aes(x = Sex, fill = Survived)) +
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates by Sex")


#
# Third question - What was the survival rate by class of ticket?
#
ggplot(titanic, aes(x = Pclass, fill = Survived)) +
  theme_bw() +
  geom_bar() +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates by Pclass")


#
# Fourth question - What was the survival rate by class of ticket
# and gender?
#
# We can leverage facets to further segment the data and enable
# "visual drill-down" into the data.
#
ggplot(titanic, aes(x = Sex, fill = Survived)) +
  theme_bw() +
  facet_wrap(~ Pclass) +
  geom_bar() +
  labs(y = "Passenger Count",
       title = "Titanic Survival Rates by Pclass and Sex")


#
# Next, we'll move on to visualizing continuous (i.e., numeric)
# data using ggplot2. We'll explore visualizations of single
# numeric variables (i.e., columns) and also illustrate how
# ggplot2 enables visual drill-down on numeric data.
#


#
# Fifth question - What is the distribution of passenger ages?
#
# The histogram is a staple of visualizing numeric data as it very
# powerfully communicates the distribution of a variable (i.e., column).
#
ggplot(titanic, aes(x = Age)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Age Distribution")


#
# Sixth question - What are the survival rates by age?
#
ggplot(titanic, aes(x = Age, fill = Survived)) +
  theme_bw() +
  geom_histogram(binwidth = 5) +
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Survival Rates by Age")

# Another great visualization for this question is the box-and-whisker
# plot.
ggplot(titanic, aes(x = Survived, y = Age)) +
  theme_bw() +
  geom_boxplot() +
  labs(y = "Age",
       x = "Survived",
       title = "Titanic Survival Rates by Age")


#
# Seventh question - What are the survival rates by age when segmented
# by gender and class of ticket?
#
# A related visualization to the histogram is a density plot. Think of
# a density plot as a smoothed version of the histogram. Using ggplot2
# we can use facets to allow for visual drill-down via density plots.
#
# Age is mapped to the x axis here, so the axis labels name Age on x and
# the density estimate on y.
ggplot(titanic, aes(x = Age, fill = Survived)) +
  theme_bw() +
  facet_wrap(Sex ~ Pclass) +
  geom_density(alpha = 0.5) +
  labs(y = "Density",
       x = "Age",
       title = "Titanic Survival Rates by Age, Pclass and Sex")

# If you prefer histograms, no problem!
ggplot(titanic, aes(x = Age, fill = Survived)) +
  theme_bw() +
  facet_wrap(Sex ~ Pclass) +
  geom_histogram(binwidth = 5) +
  labs(y = "Passenger Count",
       x = "Age (binwidth = 5)",
       title = "Titanic Survival Rates by Age, Pclass and Sex")


# H-1B data ----

# Load H1B data for analysis. Open in spreadsheet view (interactive only).
h1b <- read.csv("H-1B_FY2018.csv", stringsAsFactors = FALSE,
                encoding = "UTF-8")
if (interactive()) {
  View(h1b)
}

# Set up factors.
h1b_factor_cols <- c(
  "EMPLOYER_NAME", "EMPLOYER_CITY", "EMPLOYER_STATE", "SOC_NAME",
  "WORKSITE_CITY", "WORKSITE_STATE", "CASE_STATUS", "PW_WAGE_LEVEL",
  "JOB_TITLE"
)
h1b[h1b_factor_cols] <- lapply(h1b[h1b_factor_cols], as.factor)

# Coerce the wage to numeric and replace values that are missing (or could
# not be converted) with the rounded mean of the remaining wages.
h1b$PREVAILING_WAGE <- as.numeric(h1b$PREVAILING_WAGE)
mean_wage <- round(mean(h1b$PREVAILING_WAGE, na.rm = TRUE))
h1b$PREVAILING_WAGE[is.na(h1b$PREVAILING_WAGE)] <- mean_wage
head(h1b)

# We can use color to look at two aspects (i.e., dimensions)
# of the data simultaneously.
ggplot(h1b, aes(x = EMPLOYER_STATE, fill = CASE_STATUS)) +
  theme_bw() +
  geom_bar() +
  labs(y = "No. of Applications", x = "Employer State",
       title = "Distribution by Employer State")

# Subsetting the data to keep only "CERTIFIED" H1B cases.
certified_h1b <- h1b %>%
  filter(CASE_STATUS == "CERTIFIED")

#' Top N employers by number of H1B applications
#'
#' Counts the rows of `data` per employer and keeps the `num_emp` employers
#' with the most rows.
#'
#' @param num_emp Number of employers to return.
#' @param data Data frame with an `EMPLOYER_NAME` column, one row per
#'   application. Defaults to the script-level `certified_h1b`.
#' @return A data frame with columns `EMPLOYER_NAME` and `num_apps`, sorted
#'   by descending `num_apps`, with at most `num_emp` rows.
top_N_employers <- function(num_emp, data = certified_h1b) {
  data %>%
    group_by(EMPLOYER_NAME) %>%
    summarise(num_apps = n()) %>%
    arrange(desc(num_apps)) %>%
    # head() returns zero rows for num_emp = 0, whereas slice(1:0) would
    # still return the first row.
    head(num_emp)
}

# Bar plot to show the top 10 employers who filed the most h1b visa
# applications.
ggplot(top_N_employers(10),
       aes(x = reorder(EMPLOYER_NAME, num_apps), y = num_apps)) +
  geom_bar(stat = "identity", alpha = 0.9, fill = "green", width = 0.7) +
  coord_flip() +
  # One break per 1000 applications; seq(0, 11000) without a step would
  # request 11001 breaks, one for every integer.
  scale_y_continuous(limits = c(0, 11000),
                     breaks = seq(0, 11000, by = 1000)) +
  geom_text(aes(label = num_apps), hjust = -0.2, size = 2) +
  ggtitle("Top 10 Employers with most applications") +
  theme_bw() +
  labs(x = "Employer Name", y = "No. of Applications")

#' Top N occupations by number of H1B applications
#'
#' Drops rows whose `SOC_NAME` is missing, counts the remaining rows of
#' `data` per occupation and keeps the `num` occupations with the most rows.
#'
#' @param num Number of occupations to return.
#' @param data Data frame with a `SOC_NAME` column, one row per application.
#'   Defaults to the script-level `certified_h1b`.
#' @return A data frame with columns `SOC_NAME` and `num_apps`, sorted by
#'   descending `num_apps`, with at most `num` rows.
top_N_SOC <- function(num, data = certified_h1b) {
  data %>%
    # Refer to the column of the piped data, not to the global data frame.
    filter(!is.na(SOC_NAME)) %>%
    group_by(SOC_NAME) %>%
    summarise(num_apps = n()) %>%
    arrange(desc(num_apps)) %>%
    head(num)
}

# Bar plot to show the top 10 H1B occupations.
ggplot(top_N_SOC(10),
       aes(x = reorder(SOC_NAME, num_apps), y = num_apps)) +
  geom_bar(stat = "identity", alpha = 0.9, fill = "blue", width = 0.7) +
  coord_flip() +
  scale_y_continuous() +
  geom_text(aes(label = num_apps), hjust = -0.2, size = 2) +
  ggtitle("Top 10 occupations with most H1B petitions") +
  theme(plot.title = element_text(size = rel(1)),
        axis.text.y = element_text(size = rel(0.8))) +
  labs(x = "SOC Name", y = "No. of Applications")