From c2e656552ef400955d2460272c5f3ef54e4365f5 Mon Sep 17 00:00:00 2001 From: ningxixu <33641600+ningxixu@users.noreply.github.com> Date: Wed, 24 Jan 2018 08:13:21 -0500 Subject: [PATCH] Rename dplyr1 to data manipulation with dplyr --- data manipulation with dplyr | 28 ++++++++++++++++++++++++++++ dplyr1 | 28 ---------------------------- 2 files changed, 28 insertions(+), 28 deletions(-) create mode 100644 data manipulation with dplyr delete mode 100644 dplyr1 diff --git a/data manipulation with dplyr b/data manipulation with dplyr new file mode 100644 index 0000000..36edbef --- /dev/null +++ b/data manipulation with dplyr @@ -0,0 +1,28 @@ +wine = read.csv('wine.csv', stringsAsFactors = F, encoding = 'UTF-8') +install.packages('dplyr') +install.packages('ggplot2') +library(dplyr) +library(ggplot2) +wine = wine[,-1] +wine = wine %>% select(-c(description)) +wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) + +selected_countries = wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) %>% top_n(10) %>% select(country) +selected_countries = as.character(selected_countries$country) +select_points=wine %>% filter(country %in% selected_countries) %>% select(country, points) %>% arrange(country) +ggplot(wine, aes(points,price)) + geom_point() + geom_smooth() +ggplot(select_points, aes(x=reorder(country,points,median),y=points)) + geom_boxplot(aes(fill=country)) + xlab("Country") + ylab("Points") + ggtitle("Distribution of Top 10 Wine Producing Countries") + theme(plot.title = element_text(hjust = 0.5)) +wine %>% filter(!(country %in% selected_countries)) %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median)) +top=wine %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median)) +top=as.character(top$country) +both=intersect(top,selected_countries) + +topwine = wine %>% group_by(variety) %>% summarize(number=n()) %>% arrange(desc(number)) %>% top_n(10) +topwine=as.character(topwine$variety) + +wine %>% filter(variety %in% topwine) %>% group_by(variety)%>% summarize(median=median(points)) %>% ggplot(aes(reorder(variety,median),median)) + geom_col(aes(fill=variety)) + xlab('Variety') + ylab('Median Point') + scale_x_discrete(labels=abbreviate) + +top15percent=wine %>% arrange(desc(points)) %>% filter(points > quantile(points, prob = 0.85)) +cheapest15percent=wine %>% arrange(price) %>% head(nrow(top15percent)) +goodvalue = intersect(top15percent,cheapest15percent) +goodvalue diff --git a/dplyr1 b/dplyr1 deleted file mode 100644 index 36edbef..0000000 --- a/dplyr1 +++ /dev/null @@ -1,28 +0,0 @@ -wine = read.csv('wine.csv', stringsAsFactors = F, encoding = 'UTF-8') -install.packages('dplyr') -install.packages('ggplot2') -library(dplyr) -library(ggplot2) -wine = wine[,-1] -wine = wine %>% select(-c(description)) -wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) - -selected_countries = wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) %>% top_n(10) %>% select(country) -selected_countries = as.character(selected_countries$country) -select_points=wine %>% filter(country %in% selected_countries) %>% select(country, points) %>% arrange(country) -ggplot(wine, aes(points,price)) + geom_point() + geom_smooth() -ggplot(select_points, aes(x=reorder(country,points,median),y=points)) + geom_boxplot(aes(fill=country)) + xlab("Country") + ylab("Points") + ggtitle("Distribution of Top 10 Wine Producing Countries") + theme(plot.title = element_text(hjust = 0.5)) -wine %>% filter(!(country %in% selected_countries)) %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median)) -top=wine %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median)) -top=as.character(top$country) -both=intersect(top,selected_countries) - -topwine = wine %>% group_by(variety) %>% summarize(number=n()) %>% arrange(desc(number)) %>% top_n(10) -topwine=as.character(topwine$variety) - -wine %>% filter(variety %in% topwine) %>% group_by(variety)%>% summarize(median=median(points)) %>% ggplot(aes(reorder(variety,median),median)) + geom_col(aes(fill=variety)) + xlab('Variety') + ylab('Median Point') + scale_x_discrete(labels=abbreviate) - -top15percent=wine %>% arrange(desc(points)) %>% filter(points > quantile(points, prob = 0.85)) -cheapest15percent=wine %>% arrange(price) %>% head(nrow(top15percent)) -goodvalue = intersect(top15percent,cheapest15percent) -goodvalue -- libgit2 0.26.0