Add new auto R web scraping example for Feb 2019 Meetup
# Data Science Dojo Meetup: Automated Web Scraping in R
# Let's start scraping the main headline and paragraph text/comments of a single Reddit page
#install.packages("rvest")
library(rvest)
reddit_wbpg <- read_html("https://www.reddit.com/r/politics/comments/a1j9xs/partisan_election_officials_are_inherently_unfair/")
reddit_wbpg %>%
  html_node("title") %>%
  html_text()
# Note: class selectors like "p.s90z9tc-10" are auto-generated by Reddit's markup and may change if the site is redesigned
reddit_wbpg %>%
  html_nodes("p.s90z9tc-10") %>%
  html_text()
# Let's scrape the time and URL of all the latest pages published on Reddit's r/politics
reddit_political_news <- read_html("https://www.reddit.com/r/politics/new/")
time <- reddit_political_news %>%
  html_nodes("a._3jOxDPIQ0KaOWpzvSQo-1s") %>%
  html_text()
time
urls <- reddit_political_news %>%
  html_nodes("a._3jOxDPIQ0KaOWpzvSQo-1s") %>%
  html_attr("href")
urls
# Create a dataframe containing the URLs of the Reddit news pages and their published times
reddit_newspgs_times <- data.frame(NewsPage=urls, PublishedTime=time)
# Check the dimensions
dim(reddit_newspgs_times)
# Filter the dataframe to rows whose published time is given in minutes (i.e. within the hour)
reddit_recent_data <- reddit_newspgs_times[grep("minute|now", reddit_newspgs_times$PublishedTime),]
# Check the dimensions (there will be fewer rows if not every page was published within the last hour)
dim(reddit_recent_data)
# Loop through the URLs, grab the title and the paragraph text of the comments,
# store them in their own vectors, and create a dataframe to get it ready for analysis/modeling
titles <- c()
comments <- c()
for(i in reddit_recent_data$NewsPage){
  # Read each page once and reuse it for both the comments and the title
  reddit_news_page <- read_html(i)
  body <- reddit_news_page %>%
    html_nodes("p.s90z9tc-10") %>%
    html_text()
  comments <- append(comments, body)
  title <- reddit_news_page %>%
    html_node("title") %>%
    html_text()
  # Repeat the title once per comment so the two vectors line up row by row
  titles <- append(titles, rep(title, each=length(body)))
}
reddit_hourly_data <- data.frame(Headline=titles, Comments=comments)
dim(reddit_hourly_data)
head(reddit_hourly_data$Comments)
# Remove the moderator disclaimer comments included on every page so they don't flood the comments and skew results
disclaimers <- c(
  "As a reminder, this subreddit is for civil discussion.",
  "In general, be courteous to others. Attack ideas, not users. Personal insults, shill or troll accusations, hate speech, any advocating or wishing death/physical harm, and other rule violations can result in a permanent ban.",
  "If you see comments in violation of our rules, please report them.",
  "I am a bot, and this action was performed automatically. Please contact the moderators of this subreddit if you have any questions or concerns."
)
reddit_hourly_data_no_disclaimers <- subset(
  reddit_hourly_data, !(Comments %in% disclaimers)
)
dim(reddit_hourly_data_no_disclaimers)
head(reddit_hourly_data_no_disclaimers$Comments)
# Score the overall sentiment of each comment
# The sentimentr package scores sentiment at the sentence level,
# taking surrounding words into account, so that a phrase like 'not happy'
# has its positive sentiment cancelled out by the negator
# A negative value means the sentiment is more negative than positive
# A positive value means the sentiment is more positive than negative
#install.packages('sentimentr')
library(sentimentr)
# Keep this line commented out so it does not cause errors when the script is scheduled to run
#sentiment(reddit_hourly_data_no_disclaimers$Comments)
# Treat comments as characters, not factors,
# to convert them to a format the sentiment() function accepts
reddit_hourly_data_no_disclaimers$Comments <- as.character(reddit_hourly_data_no_disclaimers$Comments)
sentiment_scores <- sentiment(reddit_hourly_data_no_disclaimers$Comments)
head(sentiment_scores)
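# A small optional illustration of the valence-shifter behaviour described above,
# not part of the original pipeline (uncomment to try it interactively):
#sentiment("I am happy")      # the positive word should give a positive score
#sentiment("I am not happy")  # the negator 'not' should flip the score negative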
# Average the sentence-level scores across all comments to get one overall number
average_sentiment_score <- sum(sentiment_scores$sentiment)/length(sentiment_scores$sentiment)
average_sentiment_score
# Email the results of the analysis
#install.packages("sendmailR")
library(sendmailR)
from <- "<[email protected]>"
to <- "<[email protected]>"
subject <- "Hourly Sentiment Score on Current US Political Situation"
body <- c("On a scale of -1 (negative) to 1 (positive), people currently feel: ", average_sentiment_score)
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") # Use Google's SMTP server for Gmail accounts
sendmail(from=from, to=to, subject=subject, msg=body, control=mailControl)
# Schedule this script to run every hour to keep track of the overall sentiment
# Idea to take this further: instead of emailing the hourly results,
# store the average sentiment score in a table every hour so it can be plotted
# over time to see how it changes
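# A minimal sketch of that idea (the log file name below is just a placeholder):
# append the timestamped average score to a CSV on each run so it can be plotted later
#log_file <- "hourly_sentiment_log.csv"
#log_entry <- data.frame(Timestamp=Sys.time(), AverageSentiment=average_sentiment_score)
#write.table(log_entry, file=log_file, sep=",", row.names=FALSE, append=TRUE, col.names=!file.exists(log_file))
# The hourly run itself can be set up with the operating system's scheduler,
# e.g. a cron entry that calls Rscript on this file (or the taskscheduleR package on Windows)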