Unverified Commit 4a53e20d by Arham Akheel Committed by GitHub

Code changes

parent 895a1f55
################################################################################
## This code is property of Data Science Dojo
## Copyright (C) 2017~2018
##
## Objective: Manipulate and visualize data using R
## Please install "dplyr" package: install.packages("dplyr")
## Please install "ggplot2" package: install.packages("ggplot2")
################################################################################
# Script for following along in Introduction to dplyr
# Copy-paste line by line or use the "Run" button in R Studio
# Set the working directory, example: setwd("directory/dataset folder")
# Install each required package only when it is not already available, so that
# re-running the script does not re-download and reinstall it every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

# Attach the packages used throughout the script.
library(dplyr)
library(ggplot2)
# NOTE: no hard-coded, user-specific setwd() call here. Set the working
# directory to the folder that contains wine.csv before running this script.

# Reading the dataset from the working directory.
# Setting string values as characters.
# Reading the file as UTF-8 so that the Greek characters load correctly.
wine <- read.csv("wine.csv",
                 stringsAsFactors = FALSE,
                 encoding = "UTF-8")
View(wine)

# Removing columns from dataset (the first and the third column)
wine <- wine[, -c(1, 3)]

# Counting all observations grouped by country into a new variable called
# count, sorted from the most to the least frequent country
wine %>%
  group_by(country) %>%
  summarize(count = n()) %>%
  arrange(desc(count))

# Creating a new variable which contains the top 10 countries
selected_countries <- wine %>%
  group_by(country) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(10, count) %>%  # rank explicitly by count instead of the implicit last column
  select(country)
selected_countries

# Changing the format from data frame to a character vector by referencing
# the country column
selected_countries <- as.character(selected_countries$country)
class(selected_countries)

# Subsetting data: the top ten countries and their points from wine
select_points <- wine %>%
  filter(country %in% selected_countries) %>%
  select(country, points) %>%
  arrange(country)

# Scatterplot of price against points with a smooth line
ggplot(wine, aes(points, price)) +
  geom_point() +
  geom_smooth()

# Boxplot between country and points, reordered by median of points.
# Center aligning the title of the boxplot.
ggplot(select_points,
       aes(x = reorder(country, points, median),
           y = points)) +
  geom_boxplot(aes(fill = country)) +
  xlab("Country") +
  ylab("Points") +
  ggtitle("Distribution of Top 10 Wine Producing Countries") +
  theme(plot.title = element_text(hjust = 0.5))

# Median points of the countries that do not appear in selected_countries
wine %>%
  filter(!(country %in% selected_countries)) %>%
  group_by(country) %>%
  summarize(median = median(points)) %>%
  arrange(desc(median))

# Creating a new variable called top: countries ranked by their median points
top <- wine %>%
  group_by(country) %>%
  summarize(median = median(points)) %>%
  arrange(desc(median))
class(top)

# Changing the format from data frame to a character vector by referencing
# the country column
top <- as.character(top$country)
top

# Using intersect to select the values common to both vectors
both <- intersect(top, selected_countries)
both

# Using setdiff to select the values of top that are not in selected_countries
not <- setdiff(top, selected_countries)
not

# Creating a subset based on variety using group by and summarize:
# the ten most frequent varieties
topwine <- wine %>%
  group_by(variety) %>%
  summarize(number = n()) %>%
  arrange(desc(number)) %>%
  top_n(10, number)  # rank explicitly by number instead of the implicit last column
topwine <- as.character(topwine$variety)
topwine

# Plot of median points per variety (top varieties only), with abbreviated
# variety names on the x axis
wine %>%
  filter(variety %in% topwine) %>%
  group_by(variety) %>%
  summarize(median = median(points)) %>%
  ggplot(aes(reorder(variety, median), median)) +
  geom_col(aes(fill = variety)) +
  xlab('Variety') + ylab('Median Point') +
  scale_x_discrete(labels = abbreviate)

# Highly rated wines that are also among the cheapest, using intersect:
# top15percent      - rows whose points exceed the 85th percentile of points
# cheapest15percent - the same number of rows, taken from the cheapest end
# goodvalue         - rows present in both
top15percent <- wine %>%
  arrange(desc(points)) %>%
  filter(points > quantile(points, probs = 0.85))  # full argument name, no partial matching
cheapest15percent <- wine %>%
  arrange(price) %>%
  head(nrow(top15percent))
goodvalue <- intersect(top15percent, cheapest15percent)
goodvalue

# Feature Engineering
# Read the raw file again (strings kept as characters, UTF-8 encoding), then
# write the data frame to wine.rda and load it back from that file.
wine <- read.csv(
  "wine.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
save(wine, file = "wine.rda")
load("wine.rda")

# Omitting one column (the third) from the wine dataset
wine <- wine[, -3]
View(wine)
# Computing the points-to-price ratio as a new column PPratio:
# mutate() appends it to the existing columns, transmute() keeps only it.
wine1 <- mutate(wine, PPratio = points / price)
wine2 <- transmute(wine, PPratio = points / price)
# Aggregation by country: group the rows by country and count them into total
summarize(group_by(wine, country), total = n())
# Missing country values: show the rows whose country is an empty string
wine[wine$country == "", ]

# Adding missing values in the dataset.
# Only rows whose country is actually missing are filled in; rows that already
# carry a country are left untouched (an unconditional test on variety would
# relabel every "Red Blend" wine as Turkish).
country_missing <- is.na(wine$country) | wine$country == ""

# %in% never yields NA, so an NA designation or variety cannot leak into the index.
wine$country[country_missing & wine$designation %in% "Askitikos"] <- "Greece"
wine$country[country_missing & wine$designation %in% "Piedra Feliz"] <- "Chile"
wine$country[country_missing & wine$variety %in% "Red Blend"] <- "Turkey"
# Combining Datasets
# newwine: total number of rows per country, largest total first
newwine <- wine %>%
  group_by(country) %>%
  summarize(total = n()) %>%
  arrange(desc(total))

# Small inputs for the join examples: the first six rows of wine and of newwine
subset1 <- head(wine, 6)
subset2 <- head(newwine, 6)
# Each join below combines subset1 with subset2; no key is given, so dplyr
# joins on the columns the two data frames have in common.

# Full join
full <- subset1 %>% full_join(subset2)
full

# Inner join
inner <- subset1 %>% inner_join(subset2)
inner

# Left join
left <- subset1 %>% left_join(subset2)
left

# Right join
right <- subset1 %>% right_join(subset2)
right
  • Downloading Kuaishou gives you access to endless creative videos with smooth performance and updated features. With regular code changes, like parent 895a1f55 on master and updates in Introduction to dplyr.R, you can explore and visit.

    Edited by Ella Roland
  • Thanks, any authentic website for Understanding the Mystery of Codes Like pgdl9sv6sq3.

Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment