From b537a440947ba7bfcefc20a2bf33cd41a00bd05e Mon Sep 17 00:00:00 2001
From: Rebecca Merrett
Date: Tue, 16 Apr 2019 20:36:41 +0000
Subject: [PATCH] Updating script to simplify the process of grabbing all relevant datetimes tagged differently during certain times of the day

---
Notes (text between the three-dash marker and the diff is not part of the
commit message):

* The CSS selector is widened from "div.deemphasized span.invisible" to
  "div.deemphasized span", so one html_nodes() call collects the text of
  every matching span.
* The second scrape into datetime2 and the loop that appended
  datetime2[13:length(datetime2)] (a hard-coded start index) are removed.
* The new loop keeps only the entries that do not match "Today"
  (grep with invert and value set) and assigns the result back to datetime.
* Review: the loop could be one vectorized call,
  grep("Today", datetime, invert = TRUE, value = TRUE), and TRUE is safer
  than the reassignable shorthand T.
* Line breaks, blank context lines and indentation in the hunk below were
  reconstructed from a whitespace-collapsed copy of this patch; verify them
  against the repository before applying.

 web_scraping_R-master/r_web_scraping_coded_example_share.R | 23 ++++++++---------------
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/web_scraping_R-master/r_web_scraping_coded_example_share.R b/web_scraping_R-master/r_web_scraping_coded_example_share.R
index 2d26f5b..d634051 100644
--- a/web_scraping_R-master/r_web_scraping_coded_example_share.R
+++ b/web_scraping_R-master/r_web_scraping_coded_example_share.R
@@ -32,27 +32,20 @@ urls
 
 # Grab all datetimes on the page
 datetime <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
+  html_nodes("div.deemphasized span") %>% #See HTML source code for data within this tag
   html_text()
 
 datetime
 
-# Only first few datetimes are included in this div
-# Grab datetimes for all URLs on the landing page
-# and add these onto to the end of datetime vector
-datetime2 <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.deemphasized span") %>%
-  html_text()
-
-datetime2
-
-# Check the index where datetimes for other
-# URLs start and loop through from that
-# index to the end of all entries
-for(i in datetime2[13:length(datetime2)]){
-  datetime <- c(datetime, i)
+# Filter datetimes that do not follow a consistent format
+datetime2 <- c()
+for(i in datetime){
+  correct_datetime <- grep("Today", i, invert=T, value=T)
+  datetime2 <- append(datetime2, correct_datetime)
 }
 
+datetime <- datetime2
+
 datetime
 
 # Convert datetime text to a standard time format
-- 
libgit2 0.26.0