Commit 97577125 by Sanjay Pant

Added Web Scraping with R code examples

parent 4a53e20d
# Automated Web Scraping in R
Let’s imagine you would like to tap into news sources to analyze the events happening in Bitcoin or any in other topic area that frequently changes by the hour. These events could be analyzed to summarize the key changes or movements in Bitcoin, rate the overall sentiment of the recent discussions, capture important events, and so on. So you need a collection of recent Bitcoin events or news scraped every hour so that you can analyze these events.
This R script is an example of how to:
1. Write standard web scraping commands in R
2. Filter timely data based on time diffs
3. Analyze or summarize key information in the text
4. Send an email alert of the results of your analysis.
# Automated Web Scraping in R
# Let's start with a quick demonstration of scraping
# the main head and body text of a single web page
#install.packages("rvest") #Uncomment this to install this package
library(rvest)
marketwatch_wbpg <- read_html(
"https://www.marketwatch.com/story/bitcoin-jumps-after-credit-scare-2018-10-15"
)
marketwatch_wbpg %>%
html_node("title") %>% #See HTML source code for data within this tag
html_text()
marketwatch_wbpg %>%
html_nodes("p") %>% #See HTML source code for data within this tag
html_text()
# Let's read in all news on Bitcoin using the
# Marketwatch source
marketwatch_bitcoin_articles <- read_html(
"https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)
# Grab all URLs on the page
urls <- marketwatch_bitcoin_articles %>%
html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
html_attr("href")
urls
# Grab all datetimes on the page
datetime <- marketwatch_bitcoin_articles %>%
html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
html_text()
datetime
# Only first few datetimes are included in this div
# Grab datetimes for all URLs on the landing page
# and add these onto to the end of datetime vector
datetime2 <- marketwatch_bitcoin_articles %>%
html_nodes("div.deemphasized span") %>%
html_text()
datetime2
# Check the index where datetimes for other
# URLs start and loop through from that
# index to the end of all entries
for(i in datetime2[13:length(datetime2)]){
datetime <- c(datetime, i)
}
datetime
# Convert datetime text to a standard time format
#install.packages("lubridate") #Uncomment this to install this package
library(lubridate)
# First remove periods from datetime, as lubridate
# cannot interpret a.m. and p.m. with periods
datetime_clean <- gsub("\\.","",datetime)
datetime_parse <- parse_date_time(
datetime_clean, "%I:%M %p %m/%d/%Y"
)
datetime_parse
# Convert all ET (Eastern Time) datetime values to
# your local time - e.g. PT (Pacific Time)
datetime_convert <- ymd_hms(
datetime_parse, tz = "US/Eastern"
)
datetime_convert <- with_tz(
datetime_convert, "US/Pacific"
)
datetime_convert
# Create a dataframe containing the urls of the web
# pages and their converted datetimes
marketwatch_webpgs_datetimes <- data.frame(
WebPg=urls, DateTime=datetime_convert
)
dim(marketwatch_webpgs_datetimes)
# Take the difference between the your current time
# and the published datetime of the web pg and add
# as a column to the dataframe
diff_in_hours <- difftime(
Sys.time(), marketwatch_webpgs_datetimes$DateTime, units = "hours"
)
diff_in_hours
diff_in_hours <- as.double(diff_in_hours)
diff_in_hours
marketwatch_webpgs_datetimes$DiffHours <- diff_in_hours
head(marketwatch_webpgs_datetimes)
# Filter rows of the dataframe that contain
# DiffHours of less than an hour
marketwatch_latest_data <- subset(
marketwatch_webpgs_datetimes, DiffHours < 1
)
marketwatch_latest_data
# Loop through web pg URLs, read and grab the title
# and body text, and store in a dataframe to get
# the data ready for analysis
titles <- c()
bodies <- c()
for(i in marketwatch_latest_data$WebPg){
marketwatch_latest_wbpg <- read_html(i)
title <- marketwatch_latest_wbpg %>%
html_node("title") %>%
html_text()
titles <- append(titles, title)
marketwatch_latest_wbpg <- read_html(i)
body <- marketwatch_latest_wbpg %>%
html_nodes("p") %>%
html_text()
one_body <- paste(body, collapse=" ")
bodies <- append(bodies, one_body)
}
marketwatch_latest_data$Title <- titles
marketwatch_latest_data$Body <- bodies
names(marketwatch_latest_data)
marketwatch_latest_data$Title
marketwatch_latest_data$Body[1]
# Summarize the body of the text to extract the most
# relevant, key info
# Note: There are other ways to analyze the text:
# Learn text analytics/natural language processing
# and important machine learning concepts:
# https://datasciencedojo.com/bootcamp/curriculum/
# Before summarizing the text, we need to clean it
# of uneccessary whitespace, new lines, etc
#install.packages("stringr") #Uncomment this to install this package
library(stringr)
clean_text_bodies <- str_squish(
marketwatch_latest_data$Body
)
clean_text_bodies[1]
# Loop through each body text and grab the top 3
# sentences with the most relevant information
#install.packages("LSAfun") #Uncomment this to install this package
library(LSAfun)
summary <- c()
for(i in clean_text_bodies){
top_info <- genericSummary(i,k=3);
one_summary <- paste(top_info, collapse=" ")
summary <- append(summary, one_summary)
}
summary
marketwatch_latest_data$Summary <- summary
# Email the results of the summaries, along with
# the titles
#install.packages("sendmailR") #Uncomment this to install this package
library(sendmailR)
marketwatch_title_summary <- c()
for(i in 1:length(marketwatch_latest_data$Summary)){
marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Title[i])
marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Summary[i])
}
marketwatch_title_summary
from <- "<[email protected]>"
to <- "<[email protected]>"
subject <- "Hourly Summary of Bitcoin Events"
body <- marketwatch_title_summary
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") #Use Google for Gmail accounts
sendmail(from=from,to=to,subject=subject,msg=body,control=mailControl)
#Schedule this script to run every hour
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment