Unverified Commit 4a53e20d by Arham Akheel Committed by GitHub

Code changes

parent 895a1f55
......@@ -15,11 +15,13 @@ install.packages("ggplot2")
library(dplyr)
library(ggplot2)
setwd("C:/Users/Arham/Desktop/Files/Introduction to dplyr")
#Reading the dataset from the working directory.
#Setting string values as characters
#loading the greek characters
wine = read.csv("wine.csv", stringsAsFactors = FALSE, encoding = 'UTF-8')
wine = read.csv("wine.csv",
stringsAsFactors = FALSE,
encoding = 'UTF-8')
View (wine)
......@@ -27,32 +29,61 @@ View (wine)
wine = wine[,-c(1,3)]
#Creating a dataset by counting all observations grouped by country and then creating a new variable called count
wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count))
wine %>%
group_by(country)%>%
summarize(count=n()) %>%
arrange(desc(count))
#Creating a new variable which contains the top 10 countries
selected_countries = wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) %>% top_n(10) %>% select(country)
selected_countries = wine %>%
group_by(country) %>%
summarize(count=n()) %>%
arrange(desc(count)) %>%
top_n(10) %>%
select(country)
selected_countries
#Changing the format from data frame to vector as.character referencing the country column
selected_countries = as.character(selected_countries$country)
class(selected_countries)
#Subsetting data selecting top ten countries and their points from wine
select_points=wine %>% filter(country %in% selected_countries) %>% select(country, points) %>% arrange(country)
select_points=wine %>%
filter(country %in% selected_countries) %>%
select(country, points) %>%
arrange(country)
#Scatterplot with smooth line
ggplot(wine, aes(points,price)) + geom_point() + geom_smooth()
ggplot(wine, aes(points,price)) +
geom_point() +
geom_smooth()
#Boxplot between country and points, reordered by median of points. Center aligning the Title of the boxplot
ggplot(select_points, aes(x=reorder(country,points,median),y=points)) + geom_boxplot(aes(fill=country)) + xlab("Country") + ylab("Points") + ggtitle("Distribution of Top 10 Wine Producing Countries") + theme(plot.title = element_text(hjust = 0.5))
ggplot(select_points,
aes(x=reorder(country,points,median),
y=points)) +
geom_boxplot(aes(fill=country)) +
xlab("Country") +
ylab("Points") +
ggtitle("Distribution of Top 10 Wine Producing Countries") +
theme(plot.title = element_text(hjust = 0.5))
#Filter by countries that do not appear on the selected_countries dataset
wine %>% filter(!(country %in% selected_countries)) %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median))
wine %>%
filter(!(country %in% selected_countries)) %>%
group_by(country) %>%
summarize(median=median(points)) %>%
arrange(desc(median))
#Creating a new variable called top using country and points to rate them based on points
top=wine %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median))
top=wine %>%
group_by(country) %>%
summarize(median=median(points)) %>%
arrange(desc(median))
class(top)
#Changing the format from data frame to vector as.character referencing the country column
......@@ -68,22 +99,44 @@ not = setdiff(top, selected_countries)
not
#Creating a subset based on variety using group by and summarize
topwine = wine %>% group_by(variety) %>% summarize(number=n()) %>% arrange(desc(number)) %>% top_n(10)
topwine = wine %>%
group_by(variety) %>%
summarize(number=n()) %>%
arrange(desc(number)) %>%
top_n(10)
topwine=as.character(topwine$variety)
topwine
#Plot based on variety and points using group by and summarize
wine %>% filter(variety %in% topwine) %>% group_by(variety)%>% summarize(median=median(points)) %>% ggplot(aes(reorder(variety,median),median)) + geom_col(aes(fill=variety)) + xlab('Variety') + ylab('Median Point') + scale_x_discrete(labels=abbreviate)
wine %>%
filter(variety %in% topwine) %>%
group_by(variety)%>%
summarize(median=median(points)) %>%
ggplot(aes(reorder(variety,median),median)) +
geom_col(aes(fill=variety)) +
xlab('Variety') + ylab('Median Point') +
scale_x_discrete(labels=abbreviate)
#Creating top 15 percent cheapest wines with high rating using intersect function
top15percent=wine %>% arrange(desc(points)) %>% filter(points > quantile(points, prob = 0.85))
cheapest15percent=wine %>% arrange(price) %>% head(nrow(top15percent))
top15percent=wine %>%
arrange(desc(points)) %>%
filter(points > quantile(points, prob = 0.85))
cheapest15percent=wine %>%
arrange(price) %>%
head(nrow(top15percent))
goodvalue = intersect(top15percent,cheapest15percent)
goodvalue
#Feature Engineering
wine = read.csv('wine.csv', stringsAsFactors = FALSE, encoding = 'UTF-8')
wine = read.csv('wine.csv',
stringsAsFactors = FALSE,
encoding = 'UTF-8')
save(wine, file = "wine.rda")
load("wine.rda")
......@@ -94,23 +147,38 @@ wine = wine[,-c(3)]
View(wine)
#Using transmute and mutate functions to append a new column
wine1 = wine %>% mutate(PPratio = points/price)
wine2 = wine %>% transmute(PPratio = points/price)
wine1 = wine %>%
mutate(PPratio = points/price)
wine2 = wine %>%
transmute(PPratio = points/price)
#Aggregation by country using group by and summarize
wine %>% group_by(country) %>% summarize(total = n())
wine %>%
group_by(country) %>%
summarize(total = n())
#Missing country values
wine[wine$country == "",]
#Adding missing values in the dataset
wine$country = ifelse(wine$designation == "Askitikos", "Greece", wine$country)
wine$country = ifelse(wine$designation == "Piedra Feliz", "Chile", wine$country)
wine$country = ifelse(wine$variety == "Red Blend", "Turkey", wine$country)
wine$country =
ifelse(wine$designation == "Askitikos",
"Greece", wine$country)
wine$country =
ifelse(wine$designation == "Piedra Feliz",
"Chile", wine$country)
wine$country =
ifelse(wine$variety == "Red Blend",
"Turkey", wine$country)
#Combining Datasets
#Creating a new subset by total number of rows by country
newwine = wine %>% group_by(country) %>% summarize(total = n()) %>% arrange(desc(total))
newwine = wine %>%
group_by(country) %>%
summarize(total = n()) %>%
arrange(desc(total))
#Creating subsets with the head of wine and newwine
subset1=head(wine)
......@@ -132,5 +200,3 @@ left
right = right_join(subset1, subset2)
right
#####End of Code####
  • Downloading Kuaishou gives you access to endless creative videos with smooth performance and updated features. With regular code changes, like parent 895a1f55 on master and updates in Introduction to dplyr.R, you can explore and visit.

    Edited by Ella Roland
  • Thanks, any authentic website for Understanding the Mystery of Codes Like pgdl9sv6sq3.

Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment