# Source: web_scraping_R-master/README.md
# (Web scraping in R — code examples)
# Automated Web Scraping in R
#
# Quick demonstration: scrape the headline and the paragraph
# text of a single MarketWatch article page.
#install.packages("rvest") #Uncomment this to install this package
library(rvest)

marketwatch_wbpg <- read_html(
  "https://www.marketwatch.com/story/bitcoin-jumps-after-credit-scare-2018-10-15"
)

# The <title> tag carries the page headline
page_title <- html_text(html_node(marketwatch_wbpg, "title"))
page_title

# Each <p> tag carries one paragraph of the article body
page_paragraphs <- html_text(html_nodes(marketwatch_wbpg, "p"))
page_paragraphs
# Let's read in all news on Bitcoin using the
# Marketwatch source
marketwatch_bitcoin_articles <- read_html(
  "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)

# Grab the URL of every search result on the page
urls <- marketwatch_bitcoin_articles %>%
  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
  html_attr("href")
urls

# Grab the publication datetimes shown on the page.
# Only the first few results carry their datetime inside
# "span.invisible" nodes ...
datetime <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
  html_text()
datetime

# ... the datetimes for the remaining URLs sit in plain
# "span" nodes of the same div
datetime2 <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span") %>%
  html_text()
datetime2

# Entries 13 onward of datetime2 belong to the remaining
# URLs (index 13 was found by inspecting the rendered page --
# NOTE(review): fragile, re-verify if MarketWatch changes its
# markup). Append them in one vectorized step instead of
# growing the vector one element at a time in a loop, which
# copies the whole vector on every append (O(n^2)).
datetime <- c(datetime, datetime2[13:length(datetime2)])
datetime
# Convert datetime text to a standard time format
#install.packages("lubridate") #Uncomment this to install this package
library(lubridate)

# First remove periods from datetime, as lubridate
# cannot interpret a.m. and p.m. with periods
datetime_clean <- gsub("\\.", "", datetime)

# Parse strings like "11:12 am 10/15/2018"; parse_date_time
# returns POSIXct (in UTC by default)
datetime_parse <- parse_date_time(
  datetime_clean, "%I:%M %p %m/%d/%Y"
)
datetime_parse

# MarketWatch reports Eastern Time: stamp the parsed clock
# times as US/Eastern. force_tz() keeps the clock time and
# only changes the zone -- no need to round-trip the POSIXct
# back through ymd_hms() -- then shift to local time, e.g.
# PT (Pacific Time).
datetime_convert <- force_tz(datetime_parse, tzone = "US/Eastern")
datetime_convert <- with_tz(datetime_convert, "US/Pacific")
datetime_convert
# Pair each web page URL with its converted publication
# datetime in a single dataframe
marketwatch_webpgs_datetimes <- data.frame(
  WebPg = urls, DateTime = datetime_convert
)
dim(marketwatch_webpgs_datetimes)

# How many hours ago was each article published? Take the
# difference between the current time and the published
# datetime, and add it to the dataframe as a plain numeric
# column
hours_old <- difftime(
  Sys.time(), marketwatch_webpgs_datetimes$DateTime, units = "hours"
)
hours_old
hours_old <- as.numeric(hours_old)
hours_old
marketwatch_webpgs_datetimes$DiffHours <- hours_old
head(marketwatch_webpgs_datetimes)

# Keep only the rows published less than an hour ago
marketwatch_latest_data <- subset(
  marketwatch_webpgs_datetimes, DiffHours < 1
)
marketwatch_latest_data
# Loop through the filtered web page URLs, read each page
# ONCE, and grab both the title and the body text from the
# same parsed document -- the original code called
# read_html() twice per URL, fetching every page over the
# network a second time for no benefit. Store the results
# as new dataframe columns to get the data ready for
# analysis.
titles <- character(0)
bodies <- character(0)
for (pg_url in marketwatch_latest_data$WebPg) {
  marketwatch_latest_wbpg <- read_html(pg_url)

  # Page headline from the <title> tag
  title <- marketwatch_latest_wbpg %>%
    html_node("title") %>%
    html_text()
  titles <- append(titles, title)

  # All paragraph text, collapsed into one string per page
  body <- marketwatch_latest_wbpg %>%
    html_nodes("p") %>%
    html_text()
  one_body <- paste(body, collapse = " ")
  bodies <- append(bodies, one_body)
}
marketwatch_latest_data$Title <- titles
marketwatch_latest_data$Body <- bodies
names(marketwatch_latest_data)
marketwatch_latest_data$Title
marketwatch_latest_data$Body[1]
# Summarize the body of the text to extract the most
# relevant, key info
# Note: There are other ways to analyze the text:
# Learn text analytics/natural language processing
# and important machine learning concepts:
# https://datasciencedojo.com/bootcamp/curriculum/

# Before summarizing the text, we need to clean it
# of unnecessary whitespace, new lines, etc
#install.packages("stringr") #Uncomment this to install this package
library(stringr)
clean_text_bodies <- str_squish(
  marketwatch_latest_data$Body
)
clean_text_bodies[1]

# For each body, grab the top 3 sentences with the most
# relevant information and collapse them into one string.
#install.packages("LSAfun") #Uncomment this to install this package
library(LSAfun)
# vapply replaces the original grow-in-a-loop pattern (which
# copies the vector on every append), and the name
# `article_summaries` avoids masking base::summary().
article_summaries <- vapply(
  clean_text_bodies,
  function(body_text) {
    paste(genericSummary(body_text, k = 3), collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)
article_summaries
marketwatch_latest_data$Summary <- article_summaries
# Email the results of the summaries, along with
# the titles
#install.packages("sendmailR") #Uncomment this to install this package
library(sendmailR)

# Interleave each title with its summary (title1, summary1,
# title2, summary2, ...). rbind() stacks the two vectors as
# matrix rows and as.vector() reads them back column-major,
# which interleaves them in one vectorized step -- the
# original appended inside a 1:length() loop, which both
# grows the vector element-by-element and produces NA
# entries when the dataframe is empty (1:0 iterates).
marketwatch_title_summary <- as.vector(rbind(
  marketwatch_latest_data$Title,
  marketwatch_latest_data$Summary
))
marketwatch_title_summary

from <- "<[email protected]>"
to <- "<[email protected]>"
subject <- "Hourly Summary of Bitcoin Events"
body <- marketwatch_title_summary
mailControl <- list(smtpServer = "ASPMX.L.GOOGLE.COM") #Use Google for Gmail accounts
sendmail(from = from, to = to, subject = subject, msg = body, control = mailControl)

#Schedule this script to run every hour
# End of script