Add new automated web scraping R example for Feb 2019 Meetup
# Data Science Dojo Meetup: Automated Web Scraping in R
# Let's start by scraping the main headline and paragraph text/comments of a single Reddit page
#install.packages("rvest")
library(rvest)
reddit_wbpg <- read_html("https://www.reddit.com/r/politics/comments/a1j9xs/partisan_election_officials_are_inherently_unfair/")
reddit_wbpg %>%
  html_node("title") %>%
  html_text()
reddit_wbpg %>%
  html_nodes("p.s90z9tc-10") %>%
  html_text()
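# Note (added sketch): class selectors like "p.s90z9tc-10" are tied to Reddit's redesigned
# markup at the time of this meetup and can break when the site changes. A quick fallback
# if the selector stops matching is to grab all <p> tags instead:
paragraphs <- reddit_wbpg %>% html_nodes("p.s90z9tc-10")
if (length(paragraphs) == 0) {
  paragraphs <- reddit_wbpg %>% html_nodes("p")
}
html_text(paragraphs)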
# Let's scrape the published time and URL of the latest posts on Reddit's r/politics
reddit_political_news <- read_html("https://www.reddit.com/r/politics/new/")
time <- reddit_political_news %>%
  html_nodes("a._3jOxDPIQ0KaOWpzvSQo-1s") %>%
  html_text()
time
urls <- reddit_political_news %>%
  html_nodes("a._3jOxDPIQ0KaOWpzvSQo-1s") %>%
  html_attr("href")
urls
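# Sanity check (added sketch): both selectors target the same anchor tag, so the two
# vectors should line up one-to-one; stop early if they don't, before building the data frame
stopifnot(length(urls) == length(time))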
# Create a dataframe containing the URLs of the Reddit news pages and their published times
reddit_newspgs_times <- data.frame(NewsPage=urls, PublishedTime=time)
# Check the dimensions
dim(reddit_newspgs_times)
# Filter the dataframe to rows whose published time is given in minutes (i.e. within the last hour)
reddit_recent_data <- reddit_newspgs_times[grep("minute|now", reddit_newspgs_times$PublishedTime),]
# Check the dimensions (the row count will be smaller if not all pages were published within minutes)
dim(reddit_recent_data)
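# Guard (added sketch): if nothing on r/politics was posted within the last hour,
# there is nothing to scrape on this run, so stop cleanly
if (nrow(reddit_recent_data) == 0) {
  stop("No r/politics posts published within the last hour; nothing to scrape this run.")
}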
# Loop through the URLs, grab the headline and the paragraph text/comments of each page,
# store them in their own vectors, and build a dataframe ready for analysis/modeling
titles <- c()
comments <- c()
for(i in reddit_recent_data$NewsPage){
  # Read each page once and reuse it for both the comments and the title
  page <- read_html(i)
  body <- page %>%
    html_nodes("p.s90z9tc-10") %>%
    html_text()
  comments <- append(comments, body)
  title <- page %>%
    html_node("title") %>%
    html_text()
  # Repeat the title once per comment so the two vectors stay aligned
  titles <- append(titles, rep(title, each=length(body)))
}
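# Politeness note (added assumption, not part of the original script): when this runs
# unattended every hour, it's worth spacing out the page requests, e.g. by adding a short
# pause such as Sys.sleep(2) at the end of each loop iteration.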
reddit_hourly_data <- data.frame(Headline=titles, Comments=comments)
dim(reddit_hourly_data)
head(reddit_hourly_data$Comments)
# Remove the disclaimer comments included on every page so they don't flood the comments and skew the results
disclaimers <- c(
  "As a reminder, this subreddit is for civil discussion.",
  "In general, be courteous to others. Attack ideas, not users. Personal insults, shill or troll accusations, hate speech, any advocating or wishing death/physical harm, and other rule violations can result in a permanent ban.",
  "If you see comments in violation of our rules, please report them.",
  "I am a bot, and this action was performed automatically. Please contact the moderators of this subreddit if you have any questions or concerns."
)
reddit_hourly_data_no_disclaimers <- subset(
  reddit_hourly_data, !(Comments %in% disclaimers)
)
dim(reddit_hourly_data_no_disclaimers)
head(reddit_hourly_data_no_disclaimers$Comments)
# Score the overall sentiment of each comment
# The sentimentr package scores each sentence as a whole, taking into account the words
# surrounding a target word, so e.g. 'not happy' cancels out the positive sentiment of 'happy'
# A negative value means the sentiment is more negative than positive
# A positive value means the sentiment is more positive than negative
#install.packages('sentimentr')
library(sentimentr)
# This line is kept commented out so it does not cause errors when the script is scheduled to run
#sentiment(reddit_hourly_data_no_disclaimers$Comments)
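# Quick illustration (added sketch) of the negation handling described above, left commented
# out for the same scheduling reason; the second sentence should score negative:
#sentiment(c("I am happy today.", "I am not happy today."))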
# Treat the comments as character strings, not factors,
# which is the format the sentiment() function accepts
reddit_hourly_data_no_disclaimers$Comments <- as.character(reddit_hourly_data_no_disclaimers$Comments)
sentiment_scores <- sentiment(reddit_hourly_data_no_disclaimers$Comments)
head(sentiment_scores)
# Average the scores across all comments
average_sentiment_score <- sum(sentiment_scores$sentiment)/length(sentiment_scores$sentiment)
average_sentiment_score
# Email the results of the analysis
#install.packages("sendmailR")
library(sendmailR)
from <- "<[email protected]>"
to <- "<[email protected]>"
subject <- "Hourly Sentiment Score on Current US Political Situation"
body <- c("On a scale of -1 to 1, people feel: ", average_sentiment_score)
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") # Use Google's server for Gmail accounts
sendmail(from=from, to=to, subject=subject, msg=body, control=mailControl)
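# Note (added assumption): relaying unauthenticated through ASPMX.L.GOOGLE.COM generally only
# delivers to Gmail/Google-hosted recipient addresses and may be flagged as spam; for other
# recipients, point smtpServer at an SMTP relay you are authorized to use.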
# Schedule this script to run every hour to keep track of the overall sentiment
# Idea to take this further: instead of emailing the hourly results,
# store the average sentiment score in a table every hour so it can be
# plotted over time to see how it changes
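# Scheduling sketch (added; the package choice and file paths are assumptions, not part of
# the original meetup script). On Linux/macOS the cronR package can register the hourly run;
# Windows users can look at taskscheduleR instead. Kept commented out so the job is only
# registered once, from an interactive session, rather than on every scheduled run:
#install.packages("cronR")
#library(cronR)
#cmd <- cron_rscript("/path/to/reddit_sentiment_scraper.R")   # placeholder path to this script
#cron_add(command = cmd, frequency = "hourly", id = "reddit_sentiment_hourly",
#         description = "Hourly Reddit r/politics sentiment score")

# Sketch of the follow-up idea above: instead of (or in addition to) emailing, append each
# run's score to a CSV (file name is a placeholder) so the sentiment can be plotted over time
write.table(
  data.frame(Timestamp = Sys.time(), AverageSentiment = average_sentiment_score),
  file = "hourly_sentiment_scores.csv",
  sep = ",", row.names = FALSE,
  col.names = !file.exists("hourly_sentiment_scores.csv"),
  append = file.exists("hourly_sentiment_scores.csv")
)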