Commit 097bc5bf by Rebecca Merrett

Update code to simplify the process of getting all relevant datetimes marked/not marked invisible

parent 8f08cc5c
......@@ -23,36 +23,22 @@ marketwatch_bitcoin_articles <- read_html(
"https://www.marketwatch.com/search?q=bitcoin&m=Keyword&rpp=15&mp=0&bd=false&rs=false"
)
# Collect the href attribute of every link inside the search results
# (see the page's HTML source for the div.searchresult markup)
urls <- html_attr(
  html_nodes(marketwatch_bitcoin_articles, "div.searchresult a"),
  "href"
)
urls
# Grab the text of the datetime <span> elements found on the page.
# NOTE(review): the two consecutive html_nodes() selectors below look like the
# removed/added pair of this diff (span.invisible -> span) rather than an
# intentional chain; as written, the second selector is applied to the matches
# of the first -- confirm which line is current before running.
datetime <- marketwatch_bitcoin_articles %>%
html_nodes("div.deemphasized span.invisible") %>% # See HTML source code for data within this tag
html_nodes("div.deemphasized span") %>% # See HTML source code for data within this tag
html_text()
datetime
# Only the first few datetimes are included in that div.
# Grab the datetimes for all URLs on the landing page so they can be
# added onto the end of the datetime vector.
datetime2 <- html_text(
  html_nodes(marketwatch_bitcoin_articles, "div.deemphasized span")
)
datetime2
# Check the index where datetimes for other
# URLs start and loop through from that
# index to the end of all entries
for(i in datetime2[13:length(datetime2)]){
datetime <- c(datetime, i)
# Filter out datetimes that do not follow a consistent format: drop every
# entry containing "Today" and keep the remaining entries in their
# original order.
# grep() is vectorised over its input, so a single call replaces the
# element-by-element loop that grew datetime2 with append(). invert = TRUE
# selects the entries that do NOT match, and value = TRUE returns the
# strings themselves rather than their indices. An empty input now yields
# character(0) instead of NULL, so the result is always a character vector.
datetime2 <- grep("Today", datetime, invert = TRUE, value = TRUE)
datetime <- datetime2
datetime
# Convert datetime text to a standard time format
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment