From 097bc5bf65c075e94c8ca029e881c3986a8d79c3 Mon Sep 17 00:00:00 2001
From: Rebecca Merrett
Date: Wed, 6 Feb 2019 19:51:47 +0000
Subject: [PATCH] Update code to simplify the process of getting all relevant datetimes marked/not marked invisible

---
Review notes (ignored when the patch is applied):
 * The "Today" filter is a single vectorised grep() call instead of a
   for/append loop, and uses TRUE rather than the reassignable alias T.
 * An empty `datetime` now yields character(0) instead of NULL.
 * The hunk header and diffstat were recounted for the smaller change; the
   post-image blob id on the `index` line was not regenerated.

 web_scraping_R-master/r_web_scraping_coded_example_share.R | 28 +++++-----------------------
 1 file changed, 5 insertions(+), 23 deletions(-)

diff --git a/web_scraping_R-master/r_web_scraping_coded_example_share.R b/web_scraping_R-master/r_web_scraping_coded_example_share.R
index 2d26f5b..9e94769 100644
--- a/web_scraping_R-master/r_web_scraping_coded_example_share.R
+++ b/web_scraping_R-master/r_web_scraping_coded_example_share.R
@@ -23,36 +23,18 @@ marketwatch_bitcoin_articles <- read_html(
   "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
 )
 
-# Grab all URLs on the page
-urls <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
-  html_attr("href")
-
-urls
-
 # Grab all datetimes on the page
 datetime <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag
+  html_nodes("div.deemphasized span") %>% #See HTML source code for data within this tag
   html_text()
 
 datetime
 
-# Only first few datetimes are included in this div
-# Grab datetimes for all URLs on the landing page
-# and add these onto to the end of datetime vector
-datetime2 <- marketwatch_bitcoin_articles %>%
-  html_nodes("div.deemphasized span") %>%
-  html_text()
-
-datetime2
-
-# Check the index where datetimes for other
-# URLs start and loop through from that
-# index to the end of all entries
-for(i in datetime2[13:length(datetime2)]){
-  datetime <- c(datetime, i)
-}
+# Remove "Today" entries, which do not follow a consistent datetime format
+datetime2 <- grep("Today", datetime, invert = TRUE, value = TRUE)
 
+datetime <- datetime2
+
 datetime
 
 # Convert datetime text to a standard time format
-- 
libgit2 0.26.0