Update code to simplify getting all relevant datetimes, whether or not they are marked invisible

097bc5bf · Rebecca Merrett · 8f08cc5c · 097bc5bf
Commit 097bc5bf authored Feb 06, 2019 by Rebecca Merrett
Show whitespace changes
Inline Side-by-side

Showing with 8 additions and 22 deletions

r_web_scraping_coded_example_share.R web_scraping_R-master/r_web_scraping_coded_example_share.R +8 -22

No files found.
--- a/web_scraping_R-master/r_web_scraping_coded_example_share.R
+++ b/web_scraping_R-master/r_web_scraping_coded_example_share.R
@@ -23,36 +23,22 @@ marketwatch_bitcoin_articles <- read_html(
  "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
 )
-# Grab all URLs on the page
-urls <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
-  html_attr("href")
-urls
 # Grab all datetimes on the page
 datetime <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
+  html_nodes("div.deemphasized span") %>% #See HTML source code for data within this tag
  html_text()
 datetime
-# Only first few datetimes are included in this div
+# Filter datetimes that do not follow a consistent format
-# Grab datetimes for all URLs on the landing page
+datetime2 <- c()
-# and add these onto the end of datetime vector
+for(i in datetime){
-datetime2 <- marketwatch_bitcoin_articles %>%
+  correct_datetime <- grep("Today", i, invert=T, value=T)
-  html_nodes("div.deemphasized span") %>%
+  datetime2 <- append(datetime2, correct_datetime)
-  html_text()
-datetime2
-# Check the index where datetimes for other 
-# URLs start and loop through from that 
-# index to the end of all entries
-for(i in datetime2[13:length(datetime2)]){
-  datetime <- c(datetime, i)
 }
+datetime <- datetime2
 datetime
 # Convert datetime text to a standard time format