Commit 097bc5bf by Rebecca Merrett

Update code to simplify the process of getting all relevant datetimes marked/not marked invisible

parent 8f08cc5c
...@@ -23,36 +23,22 @@ marketwatch_bitcoin_articles <- read_html( ...@@ -23,36 +23,22 @@ marketwatch_bitcoin_articles <- read_html(
"https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false" "https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
) )
# Grab all URLs on the page
urls <- marketwatch_bitcoin_articles %>%
html_nodes("div.searchresult a") %>% #See HTML source code for data within this tag
html_attr("href")
urls
# Grab all datetimes on the page # Grab all datetimes on the page
datetime <- marketwatch_bitcoin_articles %>% datetime <- marketwatch_bitcoin_articles %>%
html_nodes("div.deemphasized span.invisible") %>% #See HTML source code for data within this tag html_nodes("div.deemphasized span") %>% #See HTML source code for data within this tag
html_text() html_text()
datetime datetime
# Only first few datetimes are included in this div # Filter datetimes that do not follow a consistent format
# Grab datetimes for all URLs on the landing page datetime2 <- c()
# and add these onto the end of datetime vector for(i in datetime){
datetime2 <- marketwatch_bitcoin_articles %>% correct_datetime <- grep("Today", i, invert=T, value=T)
html_nodes("div.deemphasized span") %>% datetime2 <- append(datetime2, correct_datetime)
html_text()
datetime2
# Check the index where datetimes for other
# URLs start and loop through from that
# index to the end of all entries
for(i in datetime2[13:length(datetime2)]){
datetime <- c(datetime, i)
} }
datetime <- datetime2
datetime datetime
# Convert datetime text to a standard time format # Convert datetime text to a standard time format
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment