Unverified Commit 4a53e20d by Arham Akheel Committed by GitHub

Code changes

parent 895a1f55
...@@ -15,11 +15,13 @@ install.packages("ggplot2") ...@@ -15,11 +15,13 @@ install.packages("ggplot2")
library(dplyr) library(dplyr)
library(ggplot2) library(ggplot2)
setwd("C:/Users/Arham/Desktop/Files/Introduction to dplyr")
#Reading the dataset from the working directory. #Reading the dataset from the working directory.
#Setting string values as characters #Setting string values as characters
#loading the greek characters #loading the greek characters
wine = read.csv("wine.csv", stringsAsFactors = FALSE, encoding = 'UTF-8') wine = read.csv("wine.csv",
stringsAsFactors = FALSE,
encoding = 'UTF-8')
View (wine) View (wine)
...@@ -27,32 +29,61 @@ View (wine) ...@@ -27,32 +29,61 @@ View (wine)
wine = wine[,-c(1,3)] wine = wine[,-c(1,3)]
#Creating a dataset by counting all observations grouped by country and then creating a new variable called count #Creating a dataset by counting all observations grouped by country and then creating a new variable called count
wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) wine %>%
group_by(country)%>%
summarize(count=n()) %>%
arrange(desc(count))
#Creating a new variable which contains the top 10 countries #Creating a new variable which contains the top 10 countries
selected_countries = wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) %>% top_n(10) %>% select(country) selected_countries = wine %>%
group_by(country) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
top_n(10) %>%
select(country)
selected_countries selected_countries
#Converting the data frame to a character vector by applying as.character to the country column #Converting the data frame to a character vector by applying as.character to the country column
selected_countries = as.character(selected_countries$country) selected_countries = as.character(selected_countries$country)
class(selected_countries) class(selected_countries)
#Subsetting data selecting top ten countries and their points from wine #Subsetting data selecting top ten countries and their points from wine
select_points=wine %>% filter(country %in% selected_countries) %>% select(country, points) %>% arrange(country) select_points=wine %>%
filter(country %in% selected_countries) %>%
select(country, points) %>%
arrange(country)
#Scatterplot with smooth line #Scatterplot with smooth line
ggplot(wine, aes(points,price)) + geom_point() + geom_smooth() ggplot(wine, aes(points,price)) +
geom_point() +
geom_smooth()
#Boxplot between country and points, reordered by median of points. Center aligning the Title of the boxplot #Boxplot between country and points, reordered by median of points. Center aligning the Title of the boxplot
ggplot(select_points, aes(x=reorder(country,points,median),y=points)) + geom_boxplot(aes(fill=country)) + xlab("Country") + ylab("Points") + ggtitle("Distribution of Top 10 Wine Producing Countries") + theme(plot.title = element_text(hjust = 0.5)) ggplot(select_points,
aes(x=reorder(country,points,median),
y=points)) +
geom_boxplot(aes(fill=country)) +
xlab("Country") +
ylab("Points") +
ggtitle("Distribution of Top 10 Wine Producing Countries") +
theme(plot.title = element_text(hjust = 0.5))
#Filter to countries that do not appear in the selected_countries vector #Filter to countries that do not appear in the selected_countries vector
wine %>% filter(!(country %in% selected_countries)) %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median)) wine %>%
filter(!(country %in% selected_countries)) %>%
group_by(country) %>%
summarize(median=median(points)) %>%
arrange(desc(median))
#Creating a new variable called top that ranks countries by their median points #Creating a new variable called top that ranks countries by their median points
top=wine %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median)) top=wine %>%
group_by(country) %>%
summarize(median=median(points)) %>%
arrange(desc(median))
class(top) class(top)
#Converting the data frame to a character vector by applying as.character to the country column #Converting the data frame to a character vector by applying as.character to the country column
...@@ -68,22 +99,44 @@ not = setdiff(top, selected_countries) ...@@ -68,22 +99,44 @@ not = setdiff(top, selected_countries)
not not
#Creating a subset based on variety using group by and summarize #Creating a subset based on variety using group by and summarize
topwine = wine %>% group_by(variety) %>% summarize(number=n()) %>% arrange(desc(number)) %>% top_n(10) topwine = wine %>%
group_by(variety) %>%
summarize(number=n()) %>%
arrange(desc(number)) %>%
top_n(10)
topwine=as.character(topwine$variety) topwine=as.character(topwine$variety)
topwine topwine
#Plot based on variety and points using group by and summarize #Plot based on variety and points using group by and summarize
wine %>% filter(variety %in% topwine) %>% group_by(variety)%>% summarize(median=median(points)) %>% ggplot(aes(reorder(variety,median),median)) + geom_col(aes(fill=variety)) + xlab('Variety') + ylab('Median Point') + scale_x_discrete(labels=abbreviate) wine %>%
filter(variety %in% topwine) %>%
group_by(variety)%>%
summarize(median=median(points)) %>%
ggplot(aes(reorder(variety,median),median)) +
geom_col(aes(fill=variety)) +
xlab('Variety') + ylab('Median Point') +
scale_x_discrete(labels=abbreviate)
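#Finding good-value wines by intersecting the wines scoring above the 85th percentile of points with an equally sized set of the cheapest wines #Finding good-value wines by intersecting the wines scoring above the 85th percentile of points with an equally sized set of the cheapest wines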
top15percent=wine %>% arrange(desc(points)) %>% filter(points > quantile(points, prob = 0.85)) top15percent=wine %>%
cheapest15percent=wine %>% arrange(price) %>% head(nrow(top15percent)) arrange(desc(points)) %>%
filter(points > quantile(points, prob = 0.85))
cheapest15percent=wine %>%
arrange(price) %>%
head(nrow(top15percent))
goodvalue = intersect(top15percent,cheapest15percent) goodvalue = intersect(top15percent,cheapest15percent)
goodvalue goodvalue
#Feature Engineering #Feature Engineering
wine = read.csv('wine.csv', stringsAsFactors = FALSE, encoding = 'UTF-8') wine = read.csv('wine.csv',
stringsAsFactors = FALSE,
encoding = 'UTF-8')
save(wine, file = "wine.rda") save(wine, file = "wine.rda")
load("wine.rda") load("wine.rda")
...@@ -94,23 +147,38 @@ wine = wine[,-c(3)] ...@@ -94,23 +147,38 @@ wine = wine[,-c(3)]
View(wine) View(wine)
#Using mutate to append a new column, and transmute to keep only the new column #Using mutate to append a new column, and transmute to keep only the new column
wine1 = wine %>% mutate(PPratio = points/price) wine1 = wine %>%
wine2 = wine %>% transmute(PPratio = points/price) mutate(PPratio = points/price)
wine2 = wine %>%
transmute(PPratio = points/price)
#Aggregation by country using group by and summarize #Aggregation by country using group by and summarize
wine %>% group_by(country) %>% summarize(total = n()) wine %>%
group_by(country) %>%
summarize(total = n())
#Missing country values #Missing country values
wine[wine$country == "",] wine[wine$country == "",]
#Filling in the missing country values in the dataset #Filling in the missing country values in the dataset
wine$country = ifelse(wine$designation == "Askitikos", "Greece", wine$country) wine$country =
wine$country = ifelse(wine$designation == "Piedra Feliz", "Chile", wine$country) ifelse(wine$designation == "Askitikos",
wine$country = ifelse(wine$variety == "Red Blend", "Turkey", wine$country) "Greece", wine$country)
wine$country =
ifelse(wine$designation == "Piedra Feliz",
"Chile", wine$country)
wine$country =
ifelse(wine$variety == "Red Blend",
"Turkey", wine$country)
#Combining Datasets #Combining Datasets
#Creating a new subset by total number of rows by country #Creating a new subset by total number of rows by country
newwine = wine %>% group_by(country) %>% summarize(total = n()) %>% arrange(desc(total)) newwine = wine %>%
group_by(country) %>%
summarize(total = n()) %>%
arrange(desc(total))
#Creating subsets with the head of wine and newwine #Creating subsets with the head of wine and newwine
subset1=head(wine) subset1=head(wine)
...@@ -132,5 +200,3 @@ left ...@@ -132,5 +200,3 @@ left
right = right_join(subset1, subset2) right = right_join(subset1, subset2)
right right
#####End of Code####
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment