r_web_scraping_coded_example_share.R

# Automated Web Scraping in R

# Let's start with a quick demonstration of scraping 
# the main head and body text of a single web page 
#install.packages("rvest") #Uncomment this to install this package
library(rvest)

marketwatch_wbpg <- read_html(
  "https://www.marketwatch.com/story/bitcoin-jumps-after-credit-scare-2018-10-15"
)

marketwatch_wbpg %>%
  html_node("title") %>% #See HTML source code for data within this tag
  html_text()

marketwatch_wbpg %>%
  html_nodes("p") %>% #See HTML source code for data within this tag
  html_text()

# Let's read in all news on Bitcoin using the
# Marketwatch source
marketwatch_bitcoin_articles <- read_html(
  "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)

# Grab all URLs on the page
urls <- marketwatch_bitcoin_articles %>%
  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
  html_attr("href")

urls

# Grab all datetimes on the page
datetime <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
  html_text()

datetime

# Only first few datetimes are included in this div
# Grab datetimes for all URLs on the landing page
# and add these onto to the end of datetime vector
datetime2 <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span") %>%
  html_text()

datetime2

# Check the index where datetimes for other 
# URLs start and loop through from that 
# index to the end of all entries
for(i in datetime2[13:length(datetime2)]){
  datetime <- c(datetime, i)
}

datetime

# Convert datetime text to a standard time format
#install.packages("lubridate") #Uncomment this to install this package
library(lubridate)

# First remove periods from datetime, as lubridate 
# cannot interpret a.m. and p.m. with periods
datetime_clean <- gsub("\\.","",datetime)

datetime_parse <- parse_date_time(
  datetime_clean, "%I:%M %p %m/%d/%Y"
)
datetime_parse

# Convert all ET (Eastern Time) datetime values to 
# your local time - e.g. PT (Pacific Time)
datetime_convert <- ymd_hms(
  datetime_parse, tz = "US/Eastern"
)
datetime_convert <- with_tz(
  datetime_convert, "US/Pacific"
)
datetime_convert

# Create a dataframe containing the urls of the web 
# pages and their converted datetimes
marketwatch_webpgs_datetimes <- data.frame(
  WebPg=urls, DateTime=datetime_convert
)
dim(marketwatch_webpgs_datetimes)

# Take the difference between the your current time
# and the published datetime of the web pg and add 
# as a column to the dataframe
diff_in_hours <- difftime(
  Sys.time(), marketwatch_webpgs_datetimes$DateTime, units = "hours"
)
diff_in_hours
diff_in_hours <- as.double(diff_in_hours)
diff_in_hours
marketwatch_webpgs_datetimes$DiffHours <- diff_in_hours
head(marketwatch_webpgs_datetimes)

# Filter rows of the dataframe that contain 
# DiffHours of less than an hour
marketwatch_latest_data <- subset(
  marketwatch_webpgs_datetimes, DiffHours < 1
)
marketwatch_latest_data

# Loop through web pg URLs, read and grab the title 
# and body text, and store in a dataframe to get 
# the data ready for analysis
titles <- c()
bodies <- c()
for(i in marketwatch_latest_data$WebPg){
  
  marketwatch_latest_wbpg <- read_html(i)
  title <- marketwatch_latest_wbpg %>%
    html_node("title") %>%
    html_text()
  titles <- append(titles, title)
  
  marketwatch_latest_wbpg <- read_html(i)
  body <- marketwatch_latest_wbpg %>%
    html_nodes("p") %>%
    html_text()
  one_body <- paste(body, collapse=" ")
  bodies <- append(bodies, one_body)
  
}

marketwatch_latest_data$Title <- titles
marketwatch_latest_data$Body <- bodies

names(marketwatch_latest_data)
marketwatch_latest_data$Title
marketwatch_latest_data$Body[1]


# Summarize the body of the text to extract the most 
# relevant, key info

# Note: There are other ways to analyze the text:
# Learn text analytics/natural language processing 
# and important machine learning concepts: 
# https://datasciencedojo.com/bootcamp/curriculum/ 

# Before summarizing the text, we need to clean it 
# of uneccessary whitespace, new lines, etc 
#install.packages("stringr") #Uncomment this to install this package
library(stringr)
clean_text_bodies <- str_squish(
  marketwatch_latest_data$Body
  )
clean_text_bodies[1]

# Loop through each body text and grab the top 3 
# sentences with the most relevant information
#install.packages("LSAfun") #Uncomment this to install this package
library(LSAfun)
summary <- c()
for(i in clean_text_bodies){
  top_info <- genericSummary(i,k=3);
  one_summary <- paste(top_info, collapse=" ")
  summary <- append(summary, one_summary)
}

summary

marketwatch_latest_data$Summary <- summary

# Email the results of the summaries, along with 
# the titles
#install.packages("sendmailR") #Uncomment this to install this package
library(sendmailR)

marketwatch_title_summary <- c()
for(i in 1:length(marketwatch_latest_data$Summary)){
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Title[i])
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Summary[i])
}

marketwatch_title_summary

from <- "<rebecca.merrett@gmail.com>"
to <- "<rebecca.merrett@gmail.com>"
subject <- "Hourly Summary of Bitcoin Events"
body <- marketwatch_title_summary             
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") #Use Google for Gmail accounts

sendmail(from=from,to=to,subject=subject,msg=body,control=mailControl)

#Schedule this script to run every hour