-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01-scrape-statements-solution.R
72 lines (56 loc) · 2.09 KB
/
01-scrape-statements-solution.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
library(tidyverse)
library(lubridate)
library(rvest)
# Download and parse the presidential statement page,
# keeping the parsed document in an object called `dwight`
dwight <- read_html(
  x = "https://www.presidency.ucsb.edu/documents/special-message-the-congress-relative-space-science-and-exploration"
)
dwight

# Pick out every "a" element in the document
dwight %>%
  html_elements(css = "a")

# CSS selector for the document's speaker
# (located with SelectorGadget)
dwight %>%
  html_elements(css = ".diet-title a")

# Pull out the document's date, speaker, title, and full text
date <- dwight %>%
  html_elements(css = ".date-display-single") %>%
  html_text2() %>% # text of the matched element(s)
  mdy() # parse month-day-year text with lubridate
date

speaker <- dwight %>%
  html_elements(css = ".diet-title a") %>%
  html_text2()
speaker

title <- dwight %>%
  html_elements(css = "h1") %>%
  html_text2()
title

text <- dwight %>%
  html_elements(css = "div.field-docs-content") %>%
  html_text2()

# The document is long, so show only its first 1,000 characters
str_sub(text, start = 1, end = 1000)
# Make a function called `scrape_doc()` that accepts
# a URL of an individual document, scrapes the page,
# and returns a data frame containing the document's
# date, speaker, title, and full text.
# Scrape one document page and return its metadata and full text.
#
# Arguments:
#   url: location of a single document page; passed unchanged to
#        `read_html()`.
#
# Returns:
#   A one-row tibble with columns `date` (the page's date text parsed by
#   `mdy()`), `speaker`, `title`, and `text`. A field whose CSS selector
#   matches nothing on the page is NA instead of silently collapsing the
#   whole result to zero rows.
scrape_doc <- function(url) {
  # get HTML page
  url_contents <- read_html(x = url)

  # Text of the first node matching `css`. `html_element()` (singular)
  # yields exactly one result for the page -- a missing node becomes NA --
  # so every field has length one and the tibble below can neither end up
  # empty nor gain recycled/duplicated rows when a selector matches
  # several nodes.
  extract_field <- function(css) {
    url_contents %>%
      html_element(css = css) %>%
      html_text2()
  }

  # store the extracted fields in a data frame and return it
  tibble(
    date = mdy(extract_field(".date-display-single")), # lubridate parsing
    speaker = extract_field(".diet-title a"),
    title = extract_field("h1"),
    text = extract_field("div.field-docs-content")
  )
}
# Try the function out on a different statement
scrape_doc(
  url = "https://www.presidency.ucsb.edu/documents/letter-t-keith-glennan-administrator-national-aeronautics-and-space-administration"
)