Added Web Scraping with R code examples
web_scraping_R-master/README.md (new file, mode 100644)
# Automated Web Scraping in R

# Let's start with a quick demonstration of scraping
# the main head and body text of a single web page
#install.packages("rvest") #Uncomment this to install this package
library(rvest)

marketwatch_wbpg <- read_html(
  "https://www.marketwatch.com/story/bitcoin-jumps-after-credit-scare-2018-10-15"
)

marketwatch_wbpg %>%
  html_node("title") %>% #See HTML source code for data within this tag
  html_text()

marketwatch_wbpg %>%
  html_nodes("p") %>% #See HTML source code for data within this tag
  html_text()
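# Network reads can fail intermittently. A minimal defensive
# sketch (an addition, not part of the original script): wrapping
# read_html() in tryCatch() returns NULL on a failed request
# instead of stopping the whole script.
safe_read_html <- function(url) { #Hypothetical helper, not in the original
  tryCatch(read_html(url), error = function(e) NULL)
}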
# Let's read in all news on Bitcoin using the
# Marketwatch source
marketwatch_bitcoin_articles <- read_html(
  "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)

# Grab all URLs on the page
urls <- marketwatch_bitcoin_articles %>%
  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
  html_attr("href")
urls
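# Note (an assumption about the page, not part of the original
# script): if any scraped hrefs turn out to be relative paths,
# they can be resolved against the site root; absolute URLs pass
# through url_absolute() unchanged.
library(xml2)
urls <- url_absolute(urls, "https://www.marketwatch.com")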
# Grab all datetimes on the page
datetime <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
  html_text()
datetime

# Only the first few datetimes are included in this div
# Grab the datetimes for all URLs on the landing page
# and append them to the end of the datetime vector
datetime2 <- marketwatch_bitcoin_articles %>%
  html_nodes("div.deemphasized span") %>%
  html_text()
datetime2

# Check the index where the datetimes for the other
# URLs start (13 for this page layout) and append
# the entries from that index onward
datetime <- c(datetime, datetime2[13:length(datetime2)])
datetime
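# The starting index above is hard-coded to the current page
# layout. A hypothetical alternative (an assumption, not part of
# the original script): keep only the entries that contain a
# time-of-day pattern, assuming no two articles share an
# identical timestamp string.
datetime_alt <- unique(grep("\\d{1,2}:\\d{2}", datetime2, value = TRUE))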
# Convert datetime text to a standard time format
#install.packages("lubridate") #Uncomment this to install this package
library(lubridate)

# First remove periods from datetime, as lubridate
# cannot interpret a.m. and p.m. with periods
datetime_clean <- gsub("\\.", "", datetime)

datetime_parse <- parse_date_time(
  datetime_clean, "%I:%M %p %m/%d/%Y"
)
datetime_parse
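# Quick sanity check (an addition, not part of the original
# script): parse_date_time() returns NA for any string it cannot
# parse, so confirm nothing was silently lost.
sum(is.na(datetime_parse)) #Expect 0 if every entry parsed cleanly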
# Convert all ET (Eastern Time) datetime values to
# your local time - e.g. PT (Pacific Time)
datetime_convert <- ymd_hms(
  datetime_parse, tz = "US/Eastern"
)
datetime_convert <- with_tz(
  datetime_convert, "US/Pacific"
)
datetime_convert
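# An equivalent alternative (a suggestion, not the original
# author's approach): force_tz() attaches the Eastern zone to the
# already-parsed datetimes directly, without re-parsing them
# through ymd_hms().
datetime_convert2 <- with_tz(
  force_tz(datetime_parse, tzone = "US/Eastern"), "US/Pacific"
)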
# Create a dataframe containing the URLs of the web
# pages and their converted datetimes
marketwatch_webpgs_datetimes <- data.frame(
  WebPg=urls, DateTime=datetime_convert,
  stringsAsFactors = FALSE #Keep URLs as character strings (R < 4.0 defaults to factors)
)
dim(marketwatch_webpgs_datetimes)

# Take the difference between your current time and
# the published datetime of each web page and add it
# as a column to the dataframe
diff_in_hours <- difftime(
  Sys.time(), marketwatch_webpgs_datetimes$DateTime, units = "hours"
)
diff_in_hours
diff_in_hours <- as.double(diff_in_hours)
diff_in_hours
marketwatch_webpgs_datetimes$DiffHours <- diff_in_hours
head(marketwatch_webpgs_datetimes)
# Filter the dataframe down to rows with a DiffHours
# value of less than one, i.e. articles published
# within the last hour
marketwatch_latest_data <- subset(
  marketwatch_webpgs_datetimes, DiffHours < 1
)
marketwatch_latest_data
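# Guard sketch (an addition, not part of the original script): if
# nothing was published in the last hour, the scraping loop below
# has no rows to work with.
if (nrow(marketwatch_latest_data) == 0) {
  message("No articles in the last hour; nothing to summarize.")
}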
# Loop through the web page URLs, read and grab the
# title and body text, and store them in a dataframe
# to get the data ready for analysis
titles <- c()
bodies <- c()
for(i in marketwatch_latest_data$WebPg){
  marketwatch_latest_wbpg <- read_html(i) #Read each page once; the title and body come from the same document
  title <- marketwatch_latest_wbpg %>%
    html_node("title") %>%
    html_text()
  titles <- append(titles, title)
  body <- marketwatch_latest_wbpg %>%
    html_nodes("p") %>%
    html_text()
  one_body <- paste(body, collapse=" ")
  bodies <- append(bodies, one_body)
}
marketwatch_latest_data$Title <- titles
marketwatch_latest_data$Body <- bodies
names(marketwatch_latest_data)
marketwatch_latest_data$Title
marketwatch_latest_data$Body[1]
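# Courtesy note (an addition, not part of the original script):
# when looping over several URLs as above, a short pause between
# requests is gentler on the server, e.g. as the first line of
# the loop body:
#   Sys.sleep(1) #Pause one second before each read_html() call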
# Summarize the body of the text to extract the most
# relevant, key info
# Note: There are other ways to analyze the text:
# Learn text analytics/natural language processing
# and important machine learning concepts:
# https://datasciencedojo.com/bootcamp/curriculum/

# Before summarizing the text, we need to clean it
# of unnecessary whitespace, new lines, etc
#install.packages("stringr") #Uncomment this to install this package
library(stringr)
clean_text_bodies <- str_squish(
  marketwatch_latest_data$Body
)
clean_text_bodies[1]
# Loop through each body text and grab the top 3
# sentences with the most relevant information
#install.packages("LSAfun") #Uncomment this to install this package
library(LSAfun)
summaries <- c() #Named "summaries" so base R's summary() is not masked
for(i in clean_text_bodies){
  top_info <- genericSummary(i, k=3)
  one_summary <- paste(top_info, collapse=" ")
  summaries <- append(summaries, one_summary)
}
summaries
marketwatch_latest_data$Summary <- summaries
# Email the results of the summaries, along with
# the titles
#install.packages("sendmailR") #Uncomment this to install this package
library(sendmailR)
marketwatch_title_summary <- c()
for(i in seq_along(marketwatch_latest_data$Summary)){
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Title[i])
  marketwatch_title_summary <- append(marketwatch_title_summary, marketwatch_latest_data$Summary[i])
}
marketwatch_title_summary

from <- "<[email protected]>" #Replace with the sender address
to <- "<[email protected]>" #Replace with the recipient address
subject <- "Hourly Summary of Bitcoin Events"
body <- marketwatch_title_summary
mailControl <- list(smtpServer="ASPMX.L.GOOGLE.COM") #Use Google's SMTP server for Gmail accounts
sendmail(from=from, to=to, subject=subject, msg=body, control=mailControl)
#Schedule this script to run every hour
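# One way to do the scheduling (a sketch under assumptions, not
# part of the original script): on Windows, the taskscheduleR
# package can register an hourly task; cronR is the Linux/macOS
# counterpart. The task name and script path below are
# hypothetical placeholders.
#install.packages("taskscheduleR") #Uncomment this to install this package (Windows)
#library(taskscheduleR)
#taskscheduler_create(
#  taskname = "bitcoin_hourly_summary", #Hypothetical task name
#  rscript = "C:/scripts/bitcoin_summary.R", #Hypothetical path to this saved script
#  schedule = "HOURLY"
#)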