Introduction to dplyr 5.45 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
################################################################################
## This code is property of Data Science Dojo
## Copyright (C) 2017~2018
##
## Objective: Manipulate and visualize data using R
## Please install "dplyr" package: install.packages("dplyr")
## Please install "ggplot2" package: install.packages("ggplot2")
################################################################################
# Script for following along in Introduction to dplyr
# Copy-paste line by line or use the "Run" button in R Studio
#Set the working directory, example: setwd("directory/dataset folder")

install.packages("dplyr")
install.packages("ggplot2")
                 
ningxixu committed
16 17
library(dplyr)
library(ggplot2)
18 19 20 21 22 23 24 25 26 27 28 29
                 
#Reading the dataset from the working directory. 
#Setting string values as characters
#loading the greek characters
wine = read.csv("wine.csv", stringsAsFactors = FALSE, encoding = 'UTF-8')
                 
View (wine)
                 
#Removing columns from dataset
wine = wine[,-c(1,3)]
                 
#Creating a dataset by counting all observations grouped by country and then creating a new variable called count
ningxixu committed
30
wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count))
31 32
                 
#Creating a new variable which contains the top 10 countries
ningxixu committed
33
selected_countries = wine %>% group_by(country) %>% summarize(count=n()) %>% arrange(desc(count)) %>% top_n(10) %>% select(country)
34 35 36 37 38
                 
selected_countries
                 
                 
#Changing the format from data frame to vector as.character referencing the country column
ningxixu committed
39
selected_countries = as.character(selected_countries$country)
40 41 42
class(selected_countries)
                 
#Subsetting data selecting top ten countries and their points from wine
ningxixu committed
43
select_points=wine %>% filter(country %in% selected_countries) %>% select(country, points) %>% arrange(country)
44 45
                 
#Scatterplot with smooth line
ningxixu committed
46
ggplot(wine, aes(points,price)) + geom_point() + geom_smooth()
47 48
                 
#Boxplot between country and points, reordered by median of points. Center aligning the Title of the boxplot
ningxixu committed
49
ggplot(select_points, aes(x=reorder(country,points,median),y=points)) + geom_boxplot(aes(fill=country)) + xlab("Country") + ylab("Points") + ggtitle("Distribution of Top 10 Wine Producing Countries") + theme(plot.title = element_text(hjust = 0.5))
50 51 52 53 54
                 
#Filter by countries that do not appear on the selected_countries dataset
wine %>% filter(!(country %in% selected_countries)) %>% group_by(country) %>%  summarize(median=median(points)) %>% arrange(desc(median))
                 
#Creating a new variable called top using country and points to rate them based on points
ningxixu committed
55
top=wine %>% group_by(country) %>% summarize(median=median(points)) %>% arrange(desc(median))
56 57 58
class(top)
                 
#Changing the format from data frame to vector as.character referencing the country column
ningxixu committed
59
top=as.character(top$country)
60 61 62
top
                 
#Using intersect  function to select the common values in both datasets
ningxixu committed
63
both=intersect(top,selected_countries)
64 65 66 67 68 69 70
both
                 
#Using setdiff to select the non-overlapping values in both datasets
not = setdiff(top, selected_countries)
not
                 
#Creating a subset based on variety using group by and summarize
ningxixu committed
71 72
topwine = wine %>% group_by(variety) %>% summarize(number=n()) %>% arrange(desc(number)) %>% top_n(10)
topwine=as.character(topwine$variety)
73 74 75
topwine
                 
#Plot based on variety and points using group by and summarize
ningxixu committed
76
wine %>% filter(variety %in% topwine) %>% group_by(variety)%>% summarize(median=median(points)) %>% ggplot(aes(reorder(variety,median),median)) + geom_col(aes(fill=variety)) + xlab('Variety') + ylab('Median Point') + scale_x_discrete(labels=abbreviate)
77 78
                 
#Creating top 15 percent cheapest wines with high rating using intersect function
ningxixu committed
79 80 81 82
top15percent=wine %>% arrange(desc(points)) %>% filter(points > quantile(points, prob = 0.85))
cheapest15percent=wine %>% arrange(price) %>% head(nrow(top15percent))
goodvalue = intersect(top15percent,cheapest15percent)
goodvalue
83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134

#Feature Engineering

wine = read.csv('wine.csv', stringsAsFactors = FALSE, encoding = 'UTF-8')

save(wine, file = "wine.rda")
load("wine.rda")

#Omiting one column from the wine dataset
wine = wine[,-c(3)]  

View(wine)

#Using transmute and mutate functions to append a new column
wine1 = wine %>% mutate(PPratio = points/price)
wine2 = wine %>% transmute(PPratio = points/price)

#Aggregation by country using group by and summarize
wine  %>% group_by(country) %>% summarize(total = n())

#Missing country values
wine[wine$country == "",]

#Adding missing values in the dataset
wine$country = ifelse(wine$designation == "Askitikos", "Greece", wine$country)
wine$country = ifelse(wine$designation == "Piedra Feliz", "Chile", wine$country)
wine$country = ifelse(wine$variety == "Red Blend", "Turkey", wine$country)

#Combining Datasets

#Creating a new subset by total number of rows by country
newwine = wine  %>% group_by(country) %>% summarize(total = n()) %>% arrange(desc(total))

#Creating subsets with the head of wine and newwine
subset1=head(wine)
subset2=head(newwine)

#Combining two data frames using full join function
full = full_join(subset1, subset2)
full

#Combining two data frames using inner join function
inner = inner_join(subset1, subset2)
inner

#Combining two data frames using left join function
left = left_join(subset1, subset2)
left

#Combining two data frames using right join function
right = right_join(subset1, subset2)
right