Added Web Scraping with R code examples
web_scraping_R-master/README.md (new file, mode 100644)
# Automated Web Scraping in R

# Let's start with a quick demonstration of scraping
# the main head and body text of a single web page
#install.packages("rvest") #Uncomment this to install this package
library(rvest)

marketwatch_wbpg <- read_html(
  "https://www.marketwatch.com/story/bitcoin-jumps-after-credit-scare-2018-10-15"
)

marketwatch_wbpg %>%
  html_node("title") %>% #See HTML source code for data within this tag
  html_text()

marketwatch_wbpg %>%
  html_nodes("p") %>% #See HTML source code for data within this tag
  html_text()
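# Network reads can fail intermittently. A minimal defensive
# sketch (an addition, not part of the original script): wrapping
# read_html() in tryCatch() returns NULL on a failed request
# instead of stopping the whole script.
safe_read_html <- function(url) { #Hypothetical helper, not in the original
  tryCatch(read_html(url), error = function(e) NULL)
}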
# Let's read in all news on Bitcoin using the
# Marketwatch source
marketwatch_bitcoin_articles <- read_html(
  "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)

# Grab all URLs on the page
urls <- marketwatch_bitcoin_articles %>%
  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
  html_attr("href")
urls
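# Note (an assumption about the page, not part of the original
# script): if any scraped hrefs turn out to be relative paths,
# they can be resolved against the site root; absolute URLs pass
# through url_absolute() unchanged.
library(xml2)
urls <- url_absolute(urls, "https://www.marketwatch.com")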
# Grab all datetimes on the page
datetime <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
  html_text()
datetime

# Only the first few datetimes are included in this div
# Grab the datetimes for all URLs on the landing page
# and append them to the end of the datetime vector
datetime2 <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span") %>%
  html_text()
datetime2

# Check the index where the datetimes for the other
# URLs start (13 for this page layout) and append
# the entries from that index onward
datetime <- c(datetime, datetime2[13:length(datetime2)])
datetime
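# The starting index above is hard-coded to the current page
# layout. A hypothetical alternative (an assumption, not part of
# the original script): keep only the entries that contain a
# time-of-day pattern, assuming no two articles share an
# identical timestamp string.
datetime_alt <- unique(grep("\\d{1,2}:\\d{2}", datetime2, value = TRUE))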
# Convert datetime text to a standard time format
#install.packages("lubridate") #Uncomment this to install this package
library(lubridate)

# First remove periods from datetime, as lubridate
# cannot interpret a.m. and p.m. with periods
datetime_clean <- gsub("\\.", "", datetime)

datetime_parse <- parse_date_time(
  datetime_clean, "%I:%M %p %m/%d/%Y"
)
datetime_parse
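# Quick sanity check (an addition, not part of the original
# script): parse_date_time() returns NA for any string it cannot
# parse, so confirm nothing was silently lost.
sum(is.na(datetime_parse)) #Expect 0 if every entry parsed cleanly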
# Convert all ET (Eastern Time) datetime values to
# your local time - e.g. PT (Pacific Time)
datetime_convert <- ymd_hms(
  datetime_parse, tz = "US/Eastern"
)
datetime_convert <- with_tz(
  datetime_convert, "US/Pacific"
)
datetime_convert
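# An equivalent alternative (a suggestion, not the original
# author's approach): force_tz() attaches the Eastern zone to the
# already-parsed datetimes directly, without re-parsing them
# through ymd_hms().
datetime_convert2 <- with_tz(
  force_tz(datetime_parse, tzone = "US/Eastern"), "US/Pacific"
)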
# Create a dataframe containing the URLs of the web
# pages and their converted datetimes
marketwatch_webpgs_datetimes <- data.frame(
  WebPg=urls, DateTime=datetime_convert,
  stringsAsFactors = FALSE #Keep URLs as character strings (R < 4.0 defaults to factors)
)
dim(marketwatch_webpgs_datetimes)

# Take the difference between your current time and
# the published datetime of each web page and add it
# as a column to the dataframe
diff_in_hours <- difftime(
  Sys.time(), marketwatch_webpgs_datetimes$DateTime, units = "hours"
)
diff_in_hours
diff_in_hours <- as.double(diff_in_hours)
diff_in_hours
marketwatch_webpgs_datetimes$DiffHours <- diff_in_hours
head(marketwatch_webpgs_datetimes)
# Filter the dataframe down to rows with a DiffHours
# value of less than one, i.e. articles published
# within the last hour
marketwatch_latest_data <- subset(
  marketwatch_webpgs_datetimes, DiffHours < 1
)
marketwatch_latest_data
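# Guard sketch (an addition, not part of the original script): if
# nothing was published in the last hour, the scraping loop below
# has no rows to work with.
if (nrow(marketwatch_latest_data) == 0) {
  message("No articles in the last hour; nothing to summarize.")
}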
# Loop through the web page URLs, read and grab the
# title and body text, and store them in a dataframe
# to get the data ready for analysis
titles <- c()
bodies <- c()
for(i in marketwatch_latest_data$WebPg){
  marketwatch_latest_wbpg <- read_html(i) #Read each page once; the title and body come from the same document
  title <- marketwatch_latest_wbpg %>%
    html_node("title") %>%
    html_text()
  titles <- append(titles, title)
  body <- marketwatch_latest_wbpg %>%
    html_nodes("p") %>%
    html_text()
  one_body <- paste(body, collapse=" ")
  bodies <- append(bodies, one_body)
}
marketwatch_latest_data$Title <- titles
marketwatch_latest_data$Body <- bodies
names(marketwatch_latest_data)
marketwatch_latest_data$Title
marketwatch_latest_data$Body[1]
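# Courtesy note (an addition, not part of the original script):
# when looping over several URLs as above, a short pause between
# requests is gentler on the server, e.g. as the first line of
# the loop body:
#   Sys.sleep(1) #Pause one second before each read_html() call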
# Summarize the body of the text to extract the most
# relevant, key info
# Note: There are other ways to analyze the text:
# Learn text analytics/natural language processing
# and important machine learning concepts:
# https://datasciencedojo.com/bootcamp/curriculum/

# Before summarizing the text, we need to clean it
# of unnecessary whitespace, new lines, etc
#install.packages("stringr") #Uncomment this to install this package
library(stringr)
clean_text_bodies <- str_squish(
  marketwatch_latest_data$Body
)
clean_text_bodies[1]
# Loop through each body text and grab the top 3
# sentences with the most relevant information
#install.packages("LSAfun") #Uncomment this to install this package
library(LSAfun)
summaries <- c() #Named "summaries" so base R's summary() is not masked
for(i in clean_text_bodies){
  top_info <- genericSummary(i, k=3)
  one_summary <- paste(top_info, collapse=" ")
  summaries <- append(summaries, one_summary)
}
summaries
marketwatch_latest_data$Summary <- summaries
# Email the results of the summaries, along with
# the titles
#install.packages("sendmailR") #Uncomment this to install this package
library(sendmailR)
marketwatch_title_summary <- c()
for(i in seq_along(marketwatch_latest_data$Summary)){
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Title[i])
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Summary[i])
}
marketwatch_title_summary

from <- "<[email protected]>" #Replace with the sender address
to <- "<[email protected]>" #Replace with the recipient address
subject <- "Hourly Summary of Bitcoin Events"
body <- marketwatch_title_summary
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") #Use Google's SMTP server for Gmail accounts
sendmail(from=from, to=to, subject=subject, msg=body, control=mailControl)
#Schedule this script to run every hour
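# One way to do the scheduling (a sketch under assumptions, not
# part of the original script): on Windows, the taskscheduleR
# package can register an hourly task; cronR is the Linux/macOS
# counterpart. The task name and script path below are
# hypothetical placeholders.
#install.packages("taskscheduleR") #Uncomment this to install this package (Windows)
#library(taskscheduleR)
#taskscheduler_create(
#  taskname = "bitcoin_hourly_summary", #Hypothetical task name
#  rscript = "C:/scripts/bitcoin_summary.R", #Hypothetical path to this saved script
#  schedule = "HOURLY"
#)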