Files for webinar/tutorial

08f5fb8e · Arham Akheel · GitHub · ca999c34 · 08f5fb8e · 08f5fb8e
Unverified Commit 08f5fb8e authored 6 years ago by Arham Akheel Committed by GitHub 6 years ago
4 changed files
--- a/Introduction to Data Visualization with R and ggplot2/Data Visualization with ggplot2.R
+++ b/Introduction to Data Visualization with R and ggplot2/Data Visualization with ggplot2.R
+Copyright 2017 Data Science Dojo
+#    
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+
+
+#
+# This R source code file corresponds to the Data Science Dojo webinar 
+# titled "An Introduction to Data Visualization with R and ggplot2" 
+#
+setwd("C:/Users/Arham/Desktop/Data Visualization with ggplot2")
+install.packages("ggplot2")
+install.packages("dplyr")
+library(dplyr)
+library(ggplot2)
+
+# Load Titanic data for analysis. Open in spreadsheet view.
+titanic <- read.csv("titanic.csv", stringsAsFactors = FALSE)
+View(titanic)
+
+
+# Set up factors.
+titanic$Pclass <- as.factor(titanic$Pclass)
+titanic$Survived <- as.factor(titanic$Survived)
+titanic$Sex <- as.factor(titanic$Sex)
+titanic$Embarked <- as.factor(titanic$Embarked)
+
+
+#
+# We'll start our visual analysis of the data focusing on questions
+# related to survival rates. Specifically, these questions will use
+# the factor (i.e., categorical) variables in the data. Factor data
+# is very common in the business context and ggplot2 offers many
+# powerful features for visualizing factor data.
+#
+
+
+#
+# First question - What was the survival rate? 
+#
+# As Survived is a factor (i.e., categorical) variable, a bar chart 
+# is a great visualization to use.
+#
+ggplot(titanic, aes(x = Survived)) + 
+  geom_bar()
+
+# If you really want percentages.
+prop.table(table(titanic$Survived))
+
+# Add some customization for labels and theme.
+ggplot(titanic, aes(x = Survived)) + 
+  theme_bw() +
+  geom_bar() +
+  labs(y = "Passenger Count",
+       title = "Titanic Survival Rates")
+
+
+#
+# Second question - What was the survival rate by gender? 
+#
+# We can use color to look at two aspects (i.e., dimensions)
+# of the data simultaneously.
+#
+ggplot(titanic, aes(x = Sex, fill = Survived)) + 
+  theme_bw() +
+  geom_bar() +
+  labs(y = "Passenger Count",
+       title = "Titanic Survival Rates by Sex")
+
+
+#
+# Third question - What was the survival rate by class of ticket? 
+#
+ggplot(titanic, aes(x = Pclass, fill = Survived)) + 
+  theme_bw() +
+  geom_bar() +
+  labs(y = "Passenger Count",
+       title = "Titanic Survival Rates by Pclass")
+
+
+#
+# Fourth question - What was the survival rate by class of ticket
+#                   and gender?
+#
+# We can leverage facets to further segment the data and enable
+# "visual drill-down" into the data.
+#
+ggplot(titanic, aes(x = Sex, fill = Survived)) + 
+  theme_bw() +
+  facet_wrap(~ Pclass) +
+  geom_bar() +
+  labs(y = "Passenger Count",
+       title = "Titanic Survival Rates by Pclass and Sex")
+
+
+
+
+#
+# Next, we'll move on to visualizing continuous (i.e., numeric)
+# data using ggplot2. We'll explore visualizations of single 
+# numeric variables (i.e., columns) and also illustrate how
+# ggplot2 enables visual drill-down on numeric data.
+#
+
+
+#
+# Fifth Question - What is the distribution of passenger ages?
+#
+# The histogram is a staple of visualizing numeric data as it very 
+# powerfully communicates the distrubtion of a variable (i.e., column).
+#
+ggplot(titanic, aes(x = Age)) +
+  theme_bw() +
+  geom_histogram(binwidth = 5) +
+  labs(y = "Passenger Count",
+       x = "Age (binwidth = 5)",
+       title = "Titanic Age Distribtion")
+
+
+#
+# Sixth Question - What are the survival rates by age?
+#
+ggplot(titanic, aes(x = Age, fill = Survived)) +
+  theme_bw() +
+  geom_histogram(binwidth = 5) +
+  labs(y = "Passenger Count",
+       x = "Age (binwidth = 5)",
+       title = "Titanic Survival Rates by Age")
+
+# Another great visualization for this question is the box-and-whisker 
+# plot.
+ggplot(titanic, aes(x = Survived, y = Age)) +
+  theme_bw() +
+  geom_boxplot() +
+  labs(y = "Age",
+       x = "Survived",
+       title = "Titanic Survival Rates by Age")
+
+
+#
+# Seventh Question - What is the survival rates by age when segmented
+#                    by gender and class of ticket?
+#
+# A related visualization to the histogram is a density plot. Think of
+# a density plot as a smoothed version of the histogram. Using ggplot2
+# we can use facets to allow for visual drill-down via density plots.
+#
+ggplot(titanic, aes(x = Age, fill = Survived)) +
+  theme_bw() +
+  facet_wrap(Sex ~ Pclass) +
+  geom_density(alpha = 0.5) +
+  labs(y = "Age",
+       x = "Survived",
+       title = "Titanic Survival Rates by Age, Pclass and Sex")
+
+# If you prefer histograms, no problem!
+ggplot(titanic, aes(x = Age, fill = Survived)) +
+  theme_bw() +
+  facet_wrap(Sex ~ Pclass) +
+  geom_histogram(binwidth = 5) +
+  labs(y = "Age",
+       x = "Survived",
+       title = "Titanic Survival Rates by Age, Pclass and Sex")
+
+
+# Load H1B data for analysis. Open in spreadsheet view.
+h1b <- read.csv("H-1B_FY2018.csv", stringsAsFactors = FALSE, encoding = 'UTF-8')
+View(h1b)
+
+
+# Set up factors.
+h1b$EMPLOYER_NAME <- as.factor(h1b$EMPLOYER_NAME)
+h1b$EMPLOYER_CITY <- as.factor(h1b$EMPLOYER_CITY)
+h1b$EMPLOYER_STATE <- as.factor(h1b$EMPLOYER_STATE)
+h1b$SOC_NAME <- as.factor(h1b$SOC_NAME)
+h1b$WORKSITE_CITY <- as.factor(h1b$WORKSITE_CITY)
+h1b$WORKSITE_STATE <- as.factor(h1b$WORKSITE_STATE)
+h1b$CASE_STATUS <- as.factor(h1b$CASE_STATUS)
+h1b$PW_WAGE_LEVEL<- as.factor(h1b$PW_WAGE_LEVEL)
+h1b$JOB_TITLE<- as.factor(h1b$JOB_TITLE)
+
+h1b$PREVAILING_WAGE <- as.numeric(h1b$PREVAILING_WAGE)
+h1b$PREVAILING_WAGE[is.na(h1b$PREVAILING_WAGE)] <- round(mean(h1b$PREVAILING_WAGE, na.rm = TRUE))
+head(h1b)
+
+#We can use color to look at two aspects (i.e., dimensions)
+# of the data simentiously
+
+ggplot(h1b, aes(x = EMPLOYER_STATE, fill = CASE_STATUS)) + 
+  theme_bw() +
+  geom_bar() + 
+  labs(y = "No. of Applications", x = "Employer State",  
+       title = "Distribution by Employer State")
+
+# Subsetting the data to keep only "CERTIFIED" H1B cases
+certified_h1b <- h1b %>%
+  filter(CASE_STATUS == "CERTIFIED")
+
+#Function to return the top N employers that have the most H1B workers
+top_N_employers <- function(num_emp) {
+  certified_h1b %>%
+    group_by(EMPLOYER_NAME) %>%
+    summarise(num_apps = n()) %>%
+    arrange(desc(num_apps)) %>%
+    slice(1:num_emp)
+}
+
+# Bar plot to show the top 10 employers who filed the most h1b visa applications
+ggplot(top_N_employers(10), 
+       aes(x = reorder(EMPLOYER_NAME, num_apps), y = num_apps)) +
+  geom_bar(stat = "identity", alpha = 0.9, fill = "green", width = 0.7) +
+  coord_flip() +
+  scale_y_continuous(limits = c(0, 11000), breaks = seq(0, 11000)) +
+  geom_text(aes(label = num_apps), hjust = -0.2, size = 2) +
+  ggtitle("Top 10 Employers with most applications") +
+  theme_bw() +
+  labs(x = "Employer Name", y = "No. of Applications")
+
+
+
+# Function to return top N occupations that have the most H1B applicants
+top_N_SOC <- function(num) {
+  certified_h1b %>%
+    filter(!is.na(certified_h1b$SOC_NAME)) %>%
+    group_by(SOC_NAME) %>%
+    summarise(num_apps = n()) %>%
+    arrange(desc(num_apps)) %>%
+    slice(1:num)
+}
+
+# Bar plot to show the top 10 H1B occupations 
+ggplot(top_N_SOC(10), 
+       aes(x = reorder(SOC_NAME, num_apps), y = num_apps)) +
+  geom_bar(stat = "identity", alpha = 0.9, fill = "blue", width = 0.7) +
+  coord_flip() +
+  scale_y_continuous() +
+  geom_text(aes(label = num_apps), hjust = -0.2, size = 2) +
+  ggtitle("Top 10 occupations with most H1B petitions") +
+  theme(plot.title = element_text(size = rel(1)),
+        axis.text.y = element_text(size = rel(0.8))) +
+  labs(x = "SOC Name", y = "No. of Applications") 
+
--- a/Introduction to Data Visualization with R and ggplot2/Data Visualization with ggplot2.pdf
+++ b/Introduction to Data Visualization with R and ggplot2/Data Visualization with ggplot2.pdf
--- a/Introduction to Data Visualization with R and ggplot2/H1B Metadata 2018.pdf
+++ b/Introduction to Data Visualization with R and ggplot2/H1B Metadata 2018.pdf
--- a/Introduction to Data Visualization with R and ggplot2/Vislualizations - A thought Starter.jpg
+++ b/Introduction to Data Visualization with R and ggplot2/Vislualizations - A thought Starter.jpg