# Automated Web Scraping in R
# Let's start with a quick demonstration of scraping
# the main head and body text of a single web page
#install.packages("rvest") #Uncomment this to install this package
library(rvest)
marketwatch_wbpg <- read_html(
  "https://www.marketwatch.com/story/bitcoin-jumps-after-credit-scare-2018-10-15"
)
marketwatch_wbpg %>%
  html_node("title") %>% #See HTML source code for data within this tag
  html_text()
marketwatch_wbpg %>%
  html_nodes("p") %>% #See HTML source code for data within this tag
  html_text()
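# Aside: read_html() throws an error if a request fails, which
# would kill a scheduled run. A minimal defensive sketch - the
# helper name safe_read_html is an assumption, not part of rvest:
safe_read_html <- function(url) {
  tryCatch(read_html(url), error = function(e) NULL) #NULL marks a failed fetch
}
#e.g. page <- safe_read_html("https://www.marketwatch.com")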
# Let's read in all news on Bitcoin using the
# Marketwatch source
marketwatch_bitcoin_articles <- read_html(
  "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)
# Grab all URLs on the page
urls <- marketwatch_bitcoin_articles %>%
  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
  html_attr("href")
urls
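# Aside: if any of these hrefs come back relative (e.g. "/story/..."),
# they can be resolved against the site root before reading them.
# A sketch using xml2::url_absolute() (xml2 is installed with rvest);
# URLs that are already absolute pass through unchanged:
library(xml2)
urls <- url_absolute(urls, "https://www.marketwatch.com")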
# Grab all datetimes on the page
datetime <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
  html_text()
datetime
# Only the first few datetimes are included in this div
# Grab datetimes for all URLs on the landing page
# and add these onto the end of the datetime vector
datetime2 <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span") %>%
  html_text()
datetime2
# Check the index where the datetimes for the remaining
# URLs start (13 on this page layout) and append the
# entries from that index through the end
datetime <- c(datetime, datetime2[13:length(datetime2)])
datetime
# Convert datetime text to a standard time format
#install.packages("lubridate") #Uncomment this to install this package
library(lubridate)
# First remove periods from datetime, as lubridate
# cannot interpret a.m. and p.m. with periods
datetime_clean <- gsub("\\.", "", datetime)
datetime_parse <- parse_date_time(
  datetime_clean, "%I:%M %p %m/%d/%Y"
)
datetime_parse
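# Sanity check: parse_date_time() returns NA for any string it
# cannot match, so it is worth counting failures before moving on
sum(is.na(datetime_parse))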
# The site reports ET (Eastern Time); stamp the parsed values as
# ET, then convert to your local time - e.g. PT (Pacific Time)
datetime_convert <- force_tz(
  datetime_parse, tzone = "US/Eastern" #force_tz keeps the clock time and only sets the zone
)
datetime_convert <- with_tz(
  datetime_convert, "US/Pacific"
)
datetime_convert
# Create a dataframe containing the urls of the web
# pages and their converted datetimes
marketwatch_webpgs_datetimes <- data.frame(
  WebPg=urls, DateTime=datetime_convert
)
dim(marketwatch_webpgs_datetimes)
# Take the difference between your current time and
# the published datetime of each web page and add it
# as a column to the dataframe
diff_in_hours <- difftime(
  Sys.time(), marketwatch_webpgs_datetimes$DateTime, units = "hours"
)
diff_in_hours
diff_in_hours <- as.double(diff_in_hours)
diff_in_hours
marketwatch_webpgs_datetimes$DiffHours <- diff_in_hours
head(marketwatch_webpgs_datetimes)
# Filter to the rows of the dataframe with a
# DiffHours of less than one hour
marketwatch_latest_data <- subset(
  marketwatch_webpgs_datetimes, DiffHours < 1
)
marketwatch_latest_data
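# Guard sketch: if nothing was published in the past hour the
# subset is empty; stopping here is one reasonable choice
# (exiting quietly is another) before scraping an empty URL list
if(nrow(marketwatch_latest_data) == 0){
  stop("No new articles in the past hour - nothing to summarize")
}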
# Loop through the web page URLs, read and grab the
# title and body text, and store them in a dataframe
# to get the data ready for analysis
titles <- c()
bodies <- c()
for(i in marketwatch_latest_data$WebPg){
  marketwatch_latest_wbpg <- read_html(i)
  title <- marketwatch_latest_wbpg %>%
    html_node("title") %>%
    html_text()
  titles <- append(titles, title)
  body <- marketwatch_latest_wbpg %>% #Reuse the page already read above; no need to fetch it twice
    html_nodes("p") %>%
    html_text()
  one_body <- paste(body, collapse=" ")
  bodies <- append(bodies, one_body)
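  #Politeness sketch: a short pause between page requests is a
  #common scraping convention so the server is not hammered
  Sys.sleep(1)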
}
marketwatch_latest_data$Title <- titles
marketwatch_latest_data$Body <- bodies
names(marketwatch_latest_data)
marketwatch_latest_data$Title
marketwatch_latest_data$Body[1]
# Summarize the body of the text to extract the most
# relevant, key info
# Note: There are other ways to analyze the text:
# Learn text analytics/natural language processing
# and important machine learning concepts:
# https://datasciencedojo.com/bootcamp/curriculum/
# Before summarizing the text, we need to clean it
# of unnecessary whitespace, new lines, etc.
#install.packages("stringr") #Uncomment this to install this package
library(stringr)
clean_text_bodies <- str_squish(
  marketwatch_latest_data$Body
)
clean_text_bodies[1]
# Loop through each body text and grab the top 3
# sentences with the most relevant information
#install.packages("LSAfun") #Uncomment this to install this package
library(LSAfun)
summaries <- c() #Named "summaries" so it does not mask base::summary()
for(i in clean_text_bodies){
  top_info <- genericSummary(i, k=3)
  one_summary <- paste(top_info, collapse=" ")
  summaries <- append(summaries, one_summary)
}
summaries
marketwatch_latest_data$Summary <- summaries
# Email the results of the summaries, along with
# the titles
#install.packages("sendmailR") #Uncomment this to install this package
library(sendmailR)
marketwatch_title_summary <- c()
for(i in seq_along(marketwatch_latest_data$Summary)){
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Title[i])
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Summary[i])
}
marketwatch_title_summary
from <- "<[email protected]>"
to <- "<[email protected]>"
subject <- "Hourly Summary of Bitcoin Events"
body <- marketwatch_title_summary
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") #Use Google for Gmail accounts
sendmail(from=from, to=to, subject=subject, msg=body, control=mailControl)
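#Caveat: ASPMX.L.GOOGLE.COM is Google's inbound mail exchanger, so
#this unauthenticated send only reliably reaches Google-hosted
#recipients and may still be spam-filtered; an authenticated SMTP
#relay is sturdier for production use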
#Schedule this script to run every hour; one way is sketched below
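# A minimal scheduling sketch (assumes Linux/macOS with cron; the
# cronR package and the script path below are placeholders, not
# part of the original script):
#install.packages("cronR") #Uncomment this to install this package
#library(cronR)
#cmd <- cron_rscript("/path/to/this_script.R")
#cron_add(cmd, frequency = "hourly")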