forked from FlowWest/get-the-lead-out-statewide-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean_data.R
112 lines (96 loc) · 3.51 KB
/
clean_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
library(tidyverse)
library(readxl)
library(stringr)
# library(dplyr)
# library(tidyr)
# library(readr)
raw_lead_data <- read_excel('data-raw/monthlypostingJan2020.xlsx', skip = 1)
glimpse(raw_lead_data)
tested_schools <- raw_lead_data %>%
group_by(SchoolName, XMOD) %>%
mutate(maxResult = max(RESULT), unit = 'ppb') %>%
select(XMOD, district = DISTRICT, schoolName = SchoolName,
schoolAddress = SchoolAddress, maxResult, unit) %>%
unique() %>%
ungroup() %>%
group_by(schoolName) %>%
mutate(maxResult = max(maxResult),
lead = case_when(XMOD != "<" & maxResult > 5 ~ TRUE,
XMOD == "<" & maxResult == 5 ~ FALSE,
TRUE ~ as.logical(NA)),
maxResult = ifelse(lead == FALSE, NA, maxResult),
status = "tested") %>%
filter(!is.na(lead)) %>%
select(-XMOD) %>%
unique()
exempt <- read_excel('data-raw/exemption_forms.xlsx')
glimpse(exempt)
exempt_schools <- exempt %>%
mutate(maxResult = NA, unit = NA, lead = NA, status = "exempt") %>%
select(district = `School District`, schoolName = Name, schoolAddress = Address,
maxResult, unit, lead, status)
# old not tested data, remove schools that have been tested now
un_tested <- read_excel('data-raw/SchoolsUnsampled.xlsx')
glimpse(un_tested)
not_tested <- un_tested %>%
select(district = District, schoolName = School) %>%
unique() %>%
mutate(status = 'not tested')
not_tested_schools <- tested_schools %>%
bind_rows(not_tested) %>%
group_by(district, schoolName, status) %>%
summarise(count = n()) %>%
spread(status, count) %>%
mutate(dup = sum(c(`not tested`, tested), na.rm = TRUE)) %>%
filter(dup == 1, is.na(tested)) %>%
ungroup() %>%
select(district, schoolName) %>%
unique() %>%
mutate(status = 'not tested')
all_schools <- tested_schools %>%
bind_rows(exempt_schools) %>%
bind_rows(not_tested_schools) %>%
arrange(district, schoolName)
# CLEAN UP DISTRICT NAMES
# check for dirty district naming
dirty_district <- all_schools %>%
select(district) %>%
unique %>%
arrange(district) %>%
separate(district, c('begin', 'rest'), ' ', remove = FALSE) %>%
mutate(prev_beg = lag(begin), next_beg = lead(begin), prev_district = lag(district),
next_district = lead(district),
match = case_when(
begin == prev_beg ~ prev_district,
begin == next_beg ~ next_district,
TRUE ~ 'no match'
)) %>%
filter(match != 'no match') %>%
select(district, match)
# district names from CDE https://www.cde.ca.gov/ds/si/ds/pubschls.asp
cde_districts <- read_excel('data-raw/pubdistricts.xlsx', skip = 5) %>%
select(district = District) %>%
mutate(in_cde = TRUE)
district_lookup <- dirty_district %>%
left_join(cde_districts) %>%
filter(is.na(in_cde)) %>%
select(-in_cde)
cleaned_data <- all_schools %>%
left_join(district_lookup) %>%
mutate(district = ifelse(is.na(match), district, match),
district = ifelse(is.na(district) | district == 'private', 'Private', district)) %>%
select(-match)
cleaned_data %>%
group_by(district, schoolName) %>%
mutate(count = n()) %>%
filter(count > 1) %>%
mutate(r = rank(schoolAddress, ties.method = 'first')) %>% View
# duplicates due to addresses
cleaned_data %>%
group_by(district, schoolName) %>%
arrange(district, schoolName) %>%
mutate(r = rank(schoolAddress, ties.method = 'first')) %>%
filter(r == 1) %>%
select(-r) %>%
write_csv('ca_schools_lead_testing_data.csv')
print('Project completed')