# Text_Analysis.R
library(tidyverse)
library(tidytext)
library(pdftools)
library(wordcloud)
library(topicmodels)
library(textdata)
library(igraph)
library(ggraph)
library(arrow)
data(stop_words)
#' Count the most frequent three-word n-grams in one annual-report PDF.
#'
#' Reads the PDF text, splits it into lines, tokenises each line into
#' trigrams, discards unwanted n-grams and returns the most frequent ones.
#'
#' @param pdf_path Path to the PDF file to read.
#' @param year_label Character label stored in the `Year` column.
#' @param top_n Maximum number of rows to return (default 500).
#' @return An ungrouped tibble with columns `Year`, `word` and `n`, sorted
#'   by descending `n`.
count_report_ngrams <- function(pdf_path, year_label, top_n = 500) {
  if (!base::file.exists(pdf_path)) {
    base::stop("Annual report PDF not found: ", pdf_path, call. = FALSE)
  }

  # n-grams excluded from the counts; compared after the "x x" / "cases n"
  # clean-up below, which is why entries such as " s" appear here.
  noise_terms <- c(
    " s", "s s s", "s s a", "x s s",
    "max size mm", "min size mm"
  )

  # pdf_text() returns one string per page; read_lines() is handed that
  # vector directly and splits it into individual lines, so n-grams never
  # span a line break.
  report_lines <- pdftools::pdf_text(pdf_path) %>%
    readr::read_lines()

  dplyr::tibble(text = report_lines) %>%
    tidytext::unnest_tokens(word, text, token = "ngrams", n = 3) %>%
    # NOTE(review): stop_words appears to hold single words, so this join
    # presumably never matches a three-word n-gram. Kept (with an explicit
    # key, which also silences the join message) to preserve the existing
    # output -- confirm whether per-word stop-word removal was intended.
    dplyr::anti_join(tidytext::stop_words, by = "word") %>%
    dplyr::filter(
      !base::is.na(word),                      # lines with < 3 tokens
      word != "x x x",
      !base::grepl("\\b\\d+\\b", word)         # any stand-alone number
    ) %>%
    dplyr::mutate(
      word = base::gsub("x x", "", word),
      word = base::gsub("cases n", "", word),
      Year = year_label
    ) %>%
    dplyr::filter(!word %in% noise_terms) %>%
    dplyr::count(Year, word, sort = TRUE) %>%
    utils::head(top_n)
}

# One combined report covers 1982-1989; every later year has its own PDF.
report_labels <- c("1982-1989", base::as.character(1990:2013))

# Process each report independently and bind once at the end instead of
# growing the result with rbind() inside a loop.
text_complete <- base::lapply(report_labels, function(label) {
  count_report_ngrams(
    base::file.path("App/www/Annual_Reports", base::paste0(label, ".pdf")),
    label
  )
}) %>%
  dplyr::bind_rows()
# Build the export table: n-gram totals across every report (labelled
# "All Years") stacked on top of the per-report counts, then write it to
# App/Tidy_Data/Text.feather.

# Drop any leftover grouping so the row-bind below operates on plain tibbles
# rather than relying on how rbind() dispatches for a grouped data frame.
per_report_counts <- dplyr::ungroup(text_complete)

all_years_counts <- per_report_counts %>%
  dplyr::group_by(word) %>%
  dplyr::summarise(n = sum(n)) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(dplyr::desc(n)) %>%
  dplyr::mutate(Year = "All Years")

# bind_rows() matches columns by name; column order follows the first
# table (word, n, Year).
text_all_summary <- dplyr::bind_rows(all_years_counts, per_report_counts)

# readr::write_csv(text_all_summary, "App/Tidy_Data/Text.csv")
arrow::write_feather(text_all_summary, "App/Tidy_Data/Text.feather")
# text_all_sentiment <- text_complete %>%
# inner_join(get_sentiments("afinn")) %>%
# filter(n > 1)
#
# ggplot(text_all_sentiment, aes(word, value, fill = Year)) +
# geom_col(show.legend = FALSE) +
# facet_wrap(~Year, ncol = 10, scales = "free_x")
# text_all_summary %>%
# filter(n > 100) %>%
# arrange(dplyr::desc(n)) %>%
# dplyr::mutate(word = reorder(word, n)) %>%
# ggplot(aes(n, word)) +
# geom_col() +
# labs(y = NULL)
# wordcloud::wordcloud(
# words = text_all_summary$word,
# freq = text_all_summary$n, min.freq = 1,
# max.words = 250, random.order = FALSE, rot.per = 0,
# colors = brewer.pal(8, "Dark2"))