-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWMN_analysis.R
3237 lines (2788 loc) · 152 KB
/
WMN_analysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
library(shiny)
library(tidyverse)
#library(ggplot2)
library(ggmosaic)
library(DT)
#library(rdrop2)
#library(googledrive)
library(gargle)
library(fable)
library(tsibble)
# for wordcloud
library(RColorBrewer)
#library(wordcloud)
library(wordcloud2)
library(NLP)
library(tm)
library(stringr)
library(packcircles)
# pca stuff
library(ggfortify)
library(cluster)
library(factoextra) # clustering algorithms & visualization
# options(
# # whenever there is one account token found, use the cached token
# gargle_oauth_email = TRUE,
# # specify auth tokens should be stored in a hidden directory ".secrets"
# gargle_oauth_cache = "/.secrets"
# )
conditionalpanel_style_controls = paste("background: #EEEEEE", "padding: 15px", sep=";")
conditionalpanel_style_text = paste("background: #FFFFEE", "padding: 15px", sep=";")
wellpanel_style = "background: #FFFFEE"
left_style_text = paste("background: #FFFFEE", "padding: 15px", "border-width: thin", "border-color: grey", "border-style: solid; rounded", sep=";")
detail_style_text = paste("background: #EEEEEE", "padding: 15px", sep=";")
HR_style = "border-top: 2px solid black;"
##################################################
# if an error occurs, try to do it gracefully
css <- "
.shiny-output-error { visibility: hidden; }
.shiny-output-error:before {
visibility: visible;
content: 'An error occurred! Please contact the admin.'; }
}
"
ui <- fluidPage(
# HTML('<link rel="icon", href="favicon.png", type="image/png">'
# ),
list(
tags$head(
tags$style(
HTML("
.navbar-default .navbar-brand {color: #FFFFDD;}
.navbar-default .navbar-brand {color: #FFFFDD;}
.navbar-default .navbar-nav > li > a:hover {color: #FFFFDD;text-decoration:underline;}
")
),
)
),
navbarPage(title = "Exploring the Data: Who's Making News",
inverse = TRUE,
tabPanel("Seven days",
fluidRow(
column(4,
wellPanel(
h4(HTML("<b>Exploring the Data:</b> Who's Making News<br /><span><font size=-0.5>for Sex Crimes Involving Children?</font></span>")),
h4(HTML("Methods of data exploration for the 'Who's Making News' data set, from the amazing work done by Kristen Browde.")),
h4(HTML("Data from: <a href='https://www.whoismakingnews.com/', target='_blank'>www.whoismakingnews.com</a>")),
h4(htmlOutput("lastdate")),
checkboxGroupInput("includebubble", "Include in the plot:",
choiceNames = list("'Not Listed'"),
choiceValues =list("include_notlistedbubble")
)
),
wellPanel(style = wellpanel_style, h4(HTML("The motivation for these sets of pages was to create some interactive data exploration methods and alternative visualizations for the WMN data set. These pages also includes some simple supervised and unsupervised machine learning models. <b>(please see the tabs at the top of the page)</b><br /><br />More standard analyses of these data can be found elsewhere (for example, per capita analysis here: DataViz.fyi).<br /><br />I have cleaned up the data for some of these analyses, rather than leaving them completely 'as-is' (please see the code source, link below). Because this page is based on constantly updated data, entered by hand, and the source is maintained externally, formatting and other weird changes changes have occurred that break automated processes; updates to this analysis are not automatic (not ideal, but it keeps the page from crashing).<br /><br />Thus, the totals reported on some tabs may not be exactly the same as reported on the WMN website.<br /><br />All code used here and additional data files (see individual tabs) are available here: <a href='https://github.com/egraham/WMN', target='_blank'>github.com/egraham/WMN</a>"))),
wellPanel(style = wellpanel_style, h4(HTML("Check out my other Shiny data project using some fancy engineering to examine some physical phenomena in ecology: <a href='https://erksome.shinyapps.io/Fourier_and_Soil_Temps/', target='_blank'>Using Fourier Transform to Model Soil Temperatures</a>.<br /><br />Or, drop me a line: 'egraham.cens' at gmail!")))
),
column(8,
plotOutput(
outputId = "weekbubble", width = "100%", height = 600,
hover = hoverOpts(
id = "bubbleplot_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("bubbleplot_info", style = "pointer-events: none")
),
h5(style = detail_style_text, HTML("Above is a basic circle packing plot (a circle version of a treemap), with the size of the circle depending on the number of crimes and color schemes based on larger groups (see the 'Groups of People' tab).")
),
h5(style = detail_style_text, HTML("Hover your mouse over the bubbles for more info, found in the 'Relation' category of the data set. <u>Note</u>: These categories are a random sample of ten entries from the completed data set.")
)
)
)
),
################################################
tabPanel("Predicting Crime",
fluidRow(
column(4,
wellPanel(
radioButtons("modelselecter", "Pick the model used for prediction:",
c("Linear regression" = "linearmodel",
"Time series decomposition" = "decompmodel",
"Exponential smoothing (ETS)" = "daymodel"
),
selected = "linearmodel", inline = FALSE),
),
wellPanel(style = wellpanel_style,
h4(HTML("Predicting trends or when the next crime will occur is always strange (human behavior!), but we can use some simple tools to make some estimates, just for 'fun' and to see how these simple methods are better or worse than each other.<br /><br />Which is easiest to understand or to explain vs. the best predictor?")),
),
wellPanel(style = wellpanel_style,h5(HTML("Below are standard boxplots (median is the line, box extends to inter-quartile range, outliers as points) of the cumulative errors associated with the three models used to predict upcoming crime rates (calculated only since July 15, 2023, when sampling changed).<br /><br />Interestingly, it looks like the linear model is better at predicting than the Decomposition model, although the ETS model beats them both."))),
plotOutput(outputId = "allthree", height = "250px", width = "auto")
),
column(8,
conditionalPanel(condition="input.modelselecter == 'linearmodel'",
plotOutput(
outputId = "plotcumsum", width = "90%", height = 500,
),
h5(htmlOutput("cumsumlineartext"), style=detail_style_text),
plotOutput(outputId = "linearerrorplot",
width = "90%",
height = 500,
# tooltips
hover = hoverOpts(
id = "errorlineplot_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("errorlineplot_info", style = "pointer-events: none")
)
),
conditionalPanel(condition="input.modelselecter == 'daymodel'",
plotOutput(
outputId = "ets", width = "90%", height = 500,
),
h5(style = detail_style_text, HTML("Above, the actual and predicted crimes using ETS (exponential smoothing) time series forecast modeling, with black lines as the last month's data and red is the 'future' prediction. Here, ETS computes a weighted average (medians, actually) over the last 90 days of observations (but skipping this month - see note about data collection problems). This model has some advantages over the time series decomposition and if this is interesting to you, you should check out this paper: http://www.exponentialsmoothing.net (no hover data on this plot)<br /><br />Below, the errors of the ETS prediction, per day (hover for actual and predicted values).")),
plotOutput(outputId = "etserrorplot",
width = "90%",
height = 500,
# tooltips
hover = hoverOpts(
id = "erroretsplot_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("erroretsplot_info", style = "pointer-events: none")
)
),
conditionalPanel(condition="input.modelselecter == 'decompmodel'",
plotOutput(
outputId = "decomp", width = "90%", height = 500,
),
h5(style = detail_style_text, HTML("Above, predicted crimes using time series decomposition for a week of data, with a black line as the model for the day-of-the-week crime rate and the red point is the prediction for today. Please take a look at the 'About that line' page for a quick-and-dirty explanation of time series decomposition. The 'Seasonal' component after decomposition and the mean 'Trend' values for the last month were used to predict the crime rate on a daily basis over a week. (no hover data on this plot)<br /><br />Below, the errors of the decomposition prediction, per day (hover for actual and predicted values).")),
plotOutput(outputId = "derrorplot",
width = "90%",
height = 500,
# tooltips
hover = hoverOpts(
id = "errordplot_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("errordplot_info", style = "pointer-events: none")
)
),
)
)
),
################################################
tabPanel("About That Line",
fluidRow(
column(4,
wellPanel(
radioButtons("dataselecter", "Pick your output:",
c("Raw data" = "rawdata",
"Time series" = "timeseries"
),
selected = "rawdata", inline = FALSE)
),
wellPanel(style = wellpanel_style,
h4(HTML("The data collected for 'Who's Making News' has some interesting daily and longer-term trends. The main 'signal' observed here is a weekly rise and fall, which is probably not reflective of actual crimes committed - or is it?<br /><br />Fewer crimes are <b>reported</b> on Sundays. Are criminals taking the day off? Why should we think that sex crimes against children occurs uniformly, independent of day-of-week?<br /><br />Click on the 'Time series' radio button above to run a time series decomposition analysis on the data to expose any regular patterns. Then go back to the raw data - can you see the trends by eye?"))
),
),
column(8,
conditionalPanel(condition="input.dataselecter == 'rawdata'",
plotOutput(
outputId = "linedataraw",
width = "90%",
height = 500,
dblclick = "plot_dblclick",
brush = brushOpts(
id = "zoom_brush",
resetOnNew = TRUE,
delay=500,
delayType = "debounce"),
# tooltips
hover = hoverOpts(
id = "lineplot_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("lineplot_info", style = "pointer-events: none")
),
h5(style = detail_style_text, "Select a range by dragging your mouse, then double-click for a more detailed view and hover information. Double-click without a selection to return to full-scale."),
h5(style = detail_style_text, "Data have not been cleaned up, thus names displayed on hovering may not be the 'real' names associated with the crime.")
),
conditionalPanel(condition="input.dataselecter == 'timeseries'",
plotOutput(
outputId = "timeseries", width = "90%", height = 500,
),
h5(style = detail_style_text, HTML("Data in the 'Observed' panel at the top are the raw data, taken directly from the data set. This analysis, a Multiplicative Time Series Analysis, pulls out the 'Seasonal' component (middle plot), showing a highly regular periodicity to the data. Analysis also shows the increased rate of crime (the bottom 'Trend' row) where some sort of change in data collection occurred (a bigger web scrape?). As we collect more data, larger seasonal or annual trends will begin to appear in the 'Seasonal' part as the model tries to explain the variability on longer-term time frames.<br /><br />It would be interesting to compare the 'Trend' data here with other variables, like national weather or employment rates or ...? You can see the last big drop in crime numbers was associated with a backlog of events, rather than any drop in crime.<br /><br /> (no zooming in on this plot)"))
)
)
)
),
################################################
tabPanel("Groups of People",
fluidRow(
column(4,
wellPanel(
checkboxGroupInput("include", "Include in the plot:",
choiceNames = list("'Not Listed'", "Trans people","Drag Queens"),
choiceValues =list("include_notlisted", "include_trans", "include_dragqueen"),
)
),
wellPanel(style = wellpanel_style,
h4("This is the 'standard' way to show off numbers within the groups of data. Each larger group can be broken into sub-groups (hover over the columns).")
),
wellPanel(style = wellpanel_style, h5("People who are trans are highlighted in the categories where they are included, not counted separately from their group."))
),
column(8,
plotOutput(
outputId = "plot1", width = "90%", height = 500,
# tooltips
hover = hoverOpts(
id = "plot_hover_bar",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("hover_info_bar", style = "pointer-events: none")
),
h5(style = detail_style_text, "Hover over the groups for more information. Group percentages are based on the inclusion or exclusion of the 'Not Listed' group."),
)
)
),
################################################
tabPanel("Names and Genders",
fluidRow(
column(4,
wellPanel(checkboxGroupInput("gender", "First names of those who have committed crimes against children:",
choiceNames = list("Typically Female names", "Typically Male names", "Uncommon names"),
choiceValues = list("gender_female", "gender_male", "gender_atypical"),
selected = "gender_male"
),
radioButtons("genderselecter", "Pick your output:",
c("Word cloud" = "word_cloud",
"Pie chart" = "piegender",
"Data table" = "data_table"),
selected = "word_cloud", inline = TRUE)
),
conditionalPanel(condition="input.genderselecter == 'word_cloud'",
wellPanel(
sliderInput("decimal", "Wordcloud size:",
min = 0.1, max = 2,
value = 0.5, step = 0.1),
p("Adjust the slider bar to be able to better visualize the names.")
),
wellPanel(style = wellpanel_style,
h4(HTML("Are there names that are more represented in this group of criminals than other names? Probably not (although check out the Data table using the radio button above), but it's interesting to look at.<br /><br />Word clouds have an appeal that is hard to deny, although word clouds are not great for analytical accuracy.<br /><br />Data from 'https://genderize.io' are included."))
),
wellPanel(style = wellpanel_style, h5("Unfortunately Wordcloud2 will also not print names that 'don't fit' and so as you play with the slider, some of the most common names may disappear, or the colors assigned to names will become un-synchronized with their gender."))
),
conditionalPanel(condition="input.genderselecter == 'piegender'",
wellPanel(style = wellpanel_style,
h4(HTML("Pie charts aren't much better than word clouds, but they can indicate some obvious trends. Here, the obvious trend is that males are problematic.<br /><br />Data from 'https://genderize.io' are included."))
)
),
conditionalPanel(condition="input.genderselecter == 'data_table'",
wellPanel(style = wellpanel_style,
h4(HTML("Here we can get at the question of whether the number of crimes committed by a 'name' is proportional to the frequency of that name in the general population - are some names more or less associated with sex crimes against children? If so, what could this possibly mean (other than needing a larger or unbiased sample size)?<br /><br />Sort by 'First Name' to see how common a name is in the WMN crime set versus sort by how common the name is in the US - they are not the same! (data from 'https://genderize.io'.)<br /><br />... Or, do you dare search for your own first name in the list!?")
)
),
h5(style = detail_style_text, HTML("<b>'Crimes'</b> indicates how many events were recorded under that name in the 'Who's Making News' data set.<br /><b>'Percent from WMN'</b> indicates percent of crimes under that name from the total for that gender.<br /><b>'US Proportion'</b> indicates the frequency that the name occurs in the Genderize database (assumed to be the current frequency in the US).")
)
)
),
column(8,
conditionalPanel(condition="input.genderselecter == 'word_cloud'",
wordcloud2Output(
outputId = "wordcloud"
),
div(style = "position:relative",
uiOutput("hover_wordinfo", style = "pointer-events: none")),
p(" "),
h5(style = detail_style_text, "Notes: I extracted the first names of the sex offenders from the WMN database, assuming one space between first and last name and that everyone has a first name (not true!). These names are sent in batches to the 'https://genderize.io/' database server, where they are compared to a database of 'known' genders in the US and the probabilities of genders with each name is returned. Obviously, there will be errors and names are not always exactly correlated with sex, gender, or expression of those concepts."),
p(" "),
h5(style = detail_style_text, "Entries such as might occur for first names in the WMN database like 'FL Sting Pasco', '18 men', or 'unnamed parent' will return first names that will not return a reasonable result in the genderize.io database (and I remove those for this analysis). Actual names like 'La Luz' may also not result in usable data (but I do not specifically remove these types of names). Thus, these data are a subset of the bigger Who's Making News data set and should be considered a random (??) sample.")
),
conditionalPanel(condition="input.genderselecter == 'piegender'",
plotOutput(
outputId = "pie1",
# tooltips
hover = hoverOpts(
id = "pie_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
h5("Using all data (not from the selection on the left). An apparent, obvious trend.", style=detail_style_text),
div(style = "position:relative",
uiOutput("hover_pieinfo", style = "pointer-events: none")
)
),
conditionalPanel(condition="input.genderselecter == 'data_table'",
conditionalPanel(condition = "input.gender.indexOf('gender_male') > -1",
DT::dataTableOutput("malenamesTable"),
h5(HTML(" ")),
p(style = HR_style),
h5(HTML(" "))
),
conditionalPanel(condition = "input.gender.indexOf('gender_female') > -1",
DT::dataTableOutput("femalenamesTable"),
h5(HTML(" ")),
p(style = HR_style),
h5(HTML(" "))
),
conditionalPanel(condition = "input.gender.indexOf('gender_atypical') > -1",
DT::dataTableOutput("atypicalnamesTable"),
h5(HTML(" ")),
p(style = HR_style),
h5(HTML(" "))
)
)
)
)
),
################################################
tabPanel("States of People",
fluidRow(
column(4,
wellPanel(
selectInput(
inputId = "states",
label = HTML("An analysis using some state statistics.<br />Sort states by:"),
choices = list(
"Demographics" = c("Crimes Against Children"="by_crimes", "Alphabetical"="alpha", "Percent of Males"="pct_males", "Percent Adults with No Children"="pct_no_childs", "Percent Who Voted"="pct_voted"),
"Income and employment" = c("Median Annual Income"="median_income","Percent Below the Federal Poverty Level"="pct_below_FPL","Percent Part-Time Workers"="pct_part-time", "Percent Unemployed"="pct-unemployed"),
"Politics and money" = c("Political Party"="political", "Gross State Product"="gsp", "Per Capita Spending"="per_capita_spending", "Spending on 1° & 2° Education"="pcs_edu", "Spending on Assistance"="pcs_assistance","Spending on Corrections"="pcs_corrections")
)
)
),
conditionalPanel(condition = "input.states == 'by_crimes'",
wellPanel(style = wellpanel_style,
h4(HTML("We can use a simple 'Chi Square' type of analysis to look at crime rates also, setting the expected crime rate on a total average crime for the US, then using population per state to make a prediction - is one state higher or lower than the US average based on population?<br /><br />Crime data in this plot is expressed as a percentage of the difference between observed and expected crimes, relative to the expected (US average), using the population of each state. Thus, positive values indicate higher than expected crime rate percentage for a state (e.g., +100% is twice the US average crime rate for a state).<br /><br />Selected data from 'www.kff.org' are included."))
),
wellPanel(style = wellpanel_style,
h5(HTML("Note: many of these variables are not independent from each other and/or are strongly correlated!<br /><br />The data for the lollipop plot are also always sorted such that highest child sex crime rates are always nearest the left vertical axis, just for standardization."))
)
),
conditionalPanel(condition = "input.states != 'by_crimes'",
wellPanel(style = wellpanel_style,
h4(HTML("Examine the figures below to determine if assumptions have been violated (for example, choose 'Political Party'), or see below for the Shapiro-Wilk normality test. Any data not normally distributed may or may not be significant, regardless of the linear regression.<br /><br />Data from 'www.kff.org' included."))
),
h4(htmlOutput("shapiro"), style=detail_style_text),
plotOutput(height = "250px",
outputId = "scatterbyState"
),
plotOutput(height = "250px",
outputId = "q_q"
)
)
),
column(8,
plotOutput(
outputId = "byState",
# tooltips
hover = hoverOpts(
id = "state_hover",
delay = 200,
delayType = "debounce",
clip = TRUE,
nullOutside = TRUE
)
),
div(style = "position:relative",
uiOutput("hover_stateinfo", style = "pointer-events: none")
),
h4(htmlOutput("statetext1"), style=detail_style_text),
h5(htmlOutput("statetext2"), style=detail_style_text)
)
)
),
################################################
tabPanel("Crime and Punishment",
fluidRow(
column(4,
wellPanel(h4("Select plot and the variables to include."),
radioButtons("cpselecter", "Pick your output:",
c("Principal Component" = "pca", "K-Means" = "kmeans"),
selected = "pca", inline = TRUE
),
checkboxGroupInput("crime", "State statistics on per 100,000 people crime rates:",
choiceNames = list("Burglary","Larceny","Motor Vehicles", "Assault", "Murder", "Rape", "Robbery"),
choiceValues = list("Burglary_rate", "Larceny_rate", "Motor_rate", "Assault_rate", "Murder_rate", "Rape_rate", "Robbery_rate"),
selected = c("Burglary_rate", "Larceny_rate"), inline = TRUE
),
checkboxGroupInput("punishment", "State statistics on per capita punishment rates:",
choiceNames = list("State Prisons","Federal Prisons","Local Jails", "Youth Facilities", "Involuntary", "Probation", "Parole"),
choiceValues = list("State_Prisons_rate", "Federal_Prisons_rate", "Local_Jails_rate", "Youth_Facilities_rate", "Involuntary_Commitment_rate", "Probation_rate", "Parole_rate"),
selected = "State_Prisons_rate", inline = TRUE
),
checkboxInput("include_exp", "Include the Percent Difference from Expected Crime", value = TRUE
),
conditionalPanel(condition="input.cpselecter == 'kmeans'",
numericInput(
"kmeansnum",
"Number of Groups for K-Means",
3,
min = 1,
max = 8,
width = '75%'
)
)
),
conditionalPanel(condition="input.cpselecter == 'pca'",
wellPanel(style = wellpanel_style,
h4(HTML("Principal Component Analysis is a fun data exploration technique, an 'unsupervised machine learning' method, that collapses lots data variables into a 2-D representation. Here, the plot shows what variables are responsible for the most variation in a multi-variable data set (based on the check boxes above). Click on some boxes to see their effect. Also, see below and to the right for more info on PCA<br /><br />Data from 'ucr.fbi.gov' and 'www.prisonpolicy.org' are included."))
),
wellPanel(style = wellpanel_style, h5(HTML("Only the first two components are shown. When comparing the data to 'the Percent Difference from Expected Crime', the arrows (loadings) indicate how the variables are related to the 'Crime' arrow.<br /><br />An arrow in the same direction as the 'Crime' arrow indicates an increase in that variable is associated with an increase in crime. An arrow perpendicular (or not well-represented in the component) indicates not much correlation.")))
),
conditionalPanel(condition="input.cpselecter == 'kmeans'",
wellPanel(style = wellpanel_style,
h4(HTML("K-Means clustering is also an 'unsupervised machine learning' method (except you select how many clusters, so 'semi-supervised'?). It divides data into clusters that share similarities and that are dissimilar to the data in other clusters. Look at what variables are most represented in each cluster to get a feeling why these clusters were made, using the table to the right - what variables are most similar or most different between or within groups? Does this tell us anything about what might lead groups of states to have higher crime rates compared to other groups?<br /><br />Data from 'ucr.fbi.gov' and 'www.prisonpolicy.org' are included."))
),
wellPanel(style = wellpanel_style, h5("To calculate the clusters, usually a PCA is performed first. Then for every data point, the distance is measured from the selected number of 'centroids' (means) chosen. Centroids are then moved and the process continues until the distances are optimized. K-means is sensitive to initial values and outliers, so examine the clusters carefully."))
)
),
column(8,
conditionalPanel(condition="input.cpselecter == 'pca'",
h4(htmlOutput("pcatext")),
plotOutput(
outputId = "pcaplot"
),
h4(htmlOutput("pcatext1"), style=detail_style_text),
column(6,
h5(htmlOutput("pcatext2"), style=detail_style_text)
),
column(6,
plotOutput(
outputId = "pcacontributionspine"
)
)
),
conditionalPanel(condition="input.cpselecter == 'kmeans'",
h4(htmlOutput("kmeanstext")),
plotOutput(
outputId = "kmeansplot"
),
h4(htmlOutput("kmeanstext1"), style=detail_style_text),
DT::dataTableOutput("kmeansTable"),
column(6,
h5(htmlOutput("kmeanstext2"), style=detail_style_text)
),
column(6,
plotOutput(
outputId = "kmeansSilhouette"
)
)
)
)
)
)
)
)
server <- shinyServer(function(input,output, session)({
####### Some Functions ######
# Function to extract overall p-value of model
sorted_p <- function(state.lm) {
f <- summary(state.lm)$fstatistic
p <- pf(f[1],f[2],f[3],lower.tail=F)
attributes(p) <- NULL
return(p)
}
####### set up session and get data files
session$onSessionEnded(stopApp) # kill the shiny app when the window is closed
wmn_data_previous <- readRDS("wmn_data_previous.RDS")
wmn_names_genderized <- readRDS("wmn_names_genderized.RDS")
all_errors_longer <- readRDS("all_errors_longer.RDS")
# static files
KFF_data <- readRDS("KFF_data.RDS")
state_crime <- readRDS("state_crime.RDS")
state_punishment <- readRDS("state_punishment.RDS")
# From the Unified Crime Reporting Statistics and under the collaboration of the U.S. Department of Justice
# and the Federal Bureau of Investigation information crime statistics are available for public review.
# The following data set has information on the crime rates and totals for states across the United States
# for a wide range of years. The crime reports are divided into two main categories: property and violent crime.
# Property crime refers to burglary, larceny, and motor related crime while violent crime refers to assault, murder, rape, and robbery.
# These reports go from 1960 to 2019.
# https://ucr.fbi.gov/crime-in-the-u.s/2019/crime-in-the-u.s.-2019/downloads/download-printable-files
#state_crime <- read_csv("state_crime.csv")
# The non-profit, non-partisan Prison Policy Initiative produces cutting edge research to expose the broader harm of mass criminalization,
# and then sparks advocacy campaigns to create a more just society.
# https://www.prisonpolicy.org/reports/correctionalcontrol2023_data_appendix.html
# punishment rates per 100,000
#state_punishment <- read_csv("state_punishment.csv")
lastdate <- file.info("wmn_data_previous.RDS")$mtime
lastdate <- format(as.Date(lastdate), "%b %d, %Y")
output$lastdate <- renderText({
paste("Complete as of: ", lastdate, sep="")
})
##################### data processing ####################################
numcrimes <- max(row(wmn_data_previous))
conditionalpanel_style_controls = paste("background: #EEEEEE", "padding: 15px", sep=";")
conditionalpanel_style_text = paste("background: #FFFFEE", "padding: 15px", sep=";")
wellpanel_style = "background: #FFFFEE"
detail_style_text = paste("background: #EEEEEE", "padding: 15px", sep=";")
# count numbers in categories
category_counts <- wmn_names_genderized |>
group_by(Category) |>
summarize(Category_n = n()) |>
arrange(desc(Category_n))
# Recombine data set to include the broad categories with counts
names_category_counts <- inner_join(wmn_names_genderized, category_counts, by="Category")
names_category_counts <- names_category_counts |>
mutate(
big_category = case_when(
Category %in% c("Teachers/Aides","Coaches", "Day Care/Babysitters") ~ "Teachers",
Category %in% c("Pastors", "Priests/Brothers", "Church Employees", "Missionaries", "Mormon Leaders") ~ "Religious Employment",
Category %in% c("Family Members", "Family Friends/Neighbors") ~ "Family/Known",
Category == "Other" ~ "Other",
Category == "Politicians" ~ "Politicians",
Category == "Doctors" ~ "Doctors",
Category == "Police" ~ "Police",
Category == "Not Listed" ~ "Not Listed"
)
) |>
filter(!is.na(big_category))
# Get big category counts for stats in tooltips
big_category_counts <- names_category_counts |>
group_by(big_category) |>
summarize(big_category_n = n()) |>
arrange(desc(big_category_n))
total_count <- big_category_counts |>
summarize(total = sum(big_category_n))
named_total_count <- big_category_counts |>
filter(big_category != "Not Listed") |>
summarize(total = sum(big_category_n))
# Get numbers for stacked graph, each category with big categories redundant, long
stacked_barplot_numbers <- inner_join(names_category_counts, big_category_counts, by = "big_category") |>
distinct(Category, .keep_all = TRUE) |>
dplyr::select(Category, Category_n, big_category, big_category_n) |>
arrange(desc(big_category_n), desc(Category_n))
# make cumulative sums of the Category subsets to make tooltips easier to calculate
stacked_barplot_numbers <- stacked_barplot_numbers |>
group_by(big_category) |>
mutate(csum = cumsum(Category_n))
# Give names_category_counts the big category counts
names_category_counts <-
left_join(names_category_counts, big_category_counts, by = "big_category")
#### Process Trans and Drag Queens to put them on the chart as add-on lines
# in the categories of jobs where they belong as people
# Get how many trans and what categories
trans <- names_category_counts |>
filter(Trans) |>
left_join(stacked_barplot_numbers, by = "Category") |>
dplyr::select(Category, big_category.x) |>
rename('big_category' = big_category.x)
trans_categories <- unlist(lapply(trans$big_category, function (x) which(big_category_counts$big_category == x)))
trans <- trans |>
mutate(barrank = trans_categories)
# Get y-axis value (the max) from the stacked bar plot
trans <- inner_join(trans, stacked_barplot_numbers, by="Category") |>
dplyr::select(Category, big_category.x, barrank, Category_n, csum) |>
rename('big_category' = big_category.x)
# Get locations within the small category (Category) to place the lines, per person
# find number of trans people per category
trans_groups <- trans |>
group_by(Category, big_category, barrank, Category_n, csum) |>
summarize(.groups = "keep", trans_n = n())
# Get how many drag queens and what categories
dqueens <- names_category_counts |>
filter(DQueen) |>
left_join(stacked_barplot_numbers, by = "Category") |>
dplyr::select(Category, big_category.x) |>
rename('big_category' = big_category.x)
dqueens <- dqueens |>
group_by(Category, big_category) |>
summarize(.groups = "keep", Category_n = n())
#print(dqueens)
######################## Last 7 days bubble plot ###############################
# get the last 7 days of data (not necessarily from today)
wmn_data_previous_week <- wmn_data_previous |>
#filter(Date >= max(Date)-days(7)) |>
#group_by(dow = weekdays(Date)) |>
mutate(
category_color = case_when(
Category == "Teachers/Aides" ~ "#7A4419",
Category =="Coaches" ~ "#755C1B",
Category =="Day Care/Babysitters" ~ "#D7BE82",
Category == "Pastors" ~ "#FF6340",
Category =="Priests/Brothers" ~ "#F5365F",
Category =="Church Employees" ~ "#FFBC40",
Category =="Missionaries" ~ "#E88C46",
Category =="Mormon Leaders" ~ "#F5D22F",
Category == "Family Members" ~ "#697A29",
Category =="Family Friends/Neighbors" ~ "#B8B42D",
Category == "Other" ~ "#888888",
Category == "Politicians" ~ "#990000",
Category == "Doctors" ~ "#009900",
Category == "Police" ~ "#1C77C3",
Category == "Not Listed" ~ "#FFFFFF"
)
) |>
mutate(
text_color = case_when(
Category == "Teachers/Aides" ~ "#FFFFFF",
Category =="Coaches" ~ "#FFFFFF",
Category =="Day Care/Babysitters" ~ "#000000",
Category == "Pastors" ~ "#000000",
Category =="Priests/Brothers" ~ "#000000",
Category =="Church Employees" ~ "#000000",
Category =="Missionaries" ~ "#000000",
Category =="Mormon Leaders" ~ "#000000",
Category == "Family Members" ~ "#FFFFFF",
Category =="Family Friends/Neighbors" ~ "#000000",
Category == "Other" ~ "#FFFFFF",
Category == "Politicians" ~ "#FFFFFF",
Category == "Doctors" ~ "#000000",
Category == "Police" ~ "#FFFFFF",
Category == "Not Listed" ~ "#000000"
)
)
numcrimesweek <- max(row(wmn_data_previous_week))
numcrimesweektext <- paste("A total of ", numcrimes, sep="")
# count numbers in small categories
category_counts_bubble <- wmn_data_previous_week |>
group_by(Category) |>
mutate(Category_n = n()) |>
select(Category, Relation, Category_n, category_color, text_color)
big_category_counts_bubble <- category_counts_bubble |>
mutate(
big_category = case_when(
Category %in% c("Teachers/Aides","Coaches", "Day Care/Babysitters") ~ "Teachers",
Category %in% c("Pastors", "Priests/Brothers", "Church Employees", "Missionaries", "Mormon Leaders") ~ "Religious Employment",
Category %in% c("Family Members", "Family Friends/Neighbors") ~ "Family/Known",
Category == "Other" ~ "Other",
Category == "Politicians" ~ "Politicians",
Category == "Doctors" ~ "Doctors",
Category == "Police" ~ "Police",
Category == "Not Listed" ~ "Not Listed"
)
) |>
mutate(Category = str_replace(Category, "Teachers/Aides", "Teachers\nAides")) |>
mutate(Category = str_replace(Category, "Day Care/Babysitters", "Day Care\nBabysitters")) |>
mutate(Category = str_replace(Category, "Priests/Brothers", "Priests\nBrothers")) |>
mutate(Category = str_replace(Category, "Family Friends/Neighbors", "Friends\nNeighbors")) |>
mutate(Category = str_replace(Category, "Family Members", "Family\nMembers")) |>
mutate(Category = str_replace(Category, "Church Employees", "Church\nEmployees")) |>
mutate(Category = str_replace(Category, "Mormon Leaders", "Mormon\nLeaders")) |>
filter(!is.na(big_category)) |>
group_by(big_category) |>
mutate(big_category_n = n())
# turn into 7 day averages for Category_n and big_category_n
num_weeks <- as.numeric((max(wmn_data_previous$Date)-min(wmn_data_previous$Date))/7)
big_category_counts_bubble_w <- big_category_counts_bubble |>
mutate(Category_n = round(Category_n/num_weeks, 0)) |>
mutate(big_category_n = round(big_category_n/num_weeks,0)) |>
filter(Category_n > 0)
# Make the plot
output$weekbubble <- renderPlot({
showModal(modalDialog(
title = "Please note:",
"The WMN research project started in mid February 2023, and ran through May 23, 2024. Updates to the data set are still being processed, although new data collection has stopped."
))
# if include the "not listed
if(length(str_subset(input$includebubble, "include_notlistedbubble")) > 0) {
big_category_counts_summary_w <- big_category_counts_bubble_w |>
distinct(Category, .keep_all = TRUE)
# reduce size of text in plot
range_b <- c(3,6)
} else {
big_category_counts_summary_w <- big_category_counts_bubble_w |>
distinct(Category, .keep_all = TRUE)
big_category_counts_summary_w <- big_category_counts_summary_w |>
filter(Category != "Not Listed")
range_b <- c(4,7)
}
# Create data, group by big categories, sort by size and alpha (in case of size ties)
weekdata <- data.frame(group=paste(big_category_counts_summary_w$Category_n, big_category_counts_summary_w$Category,sep="\n"),
value=big_category_counts_summary_w$Category_n,
category_color=big_category_counts_summary_w$category_color,
text_color=big_category_counts_summary_w$text_color,
hovertextinfo=big_category_counts_summary_w$Category) |>
arrange(desc(big_category_counts_summary_w$Category_n), big_category_counts_summary_w$Category)
# Generate the layout. This function returns a dataframe with one line per bubble.
# It gives its center (x and y) and its radius, proportional of the value
packing <- circleProgressiveLayout(weekdata$value, sizetype='area')
# We can add these packing information to the initial data frame
weekdata_p <- cbind(weekdata, packing)
# Check that radius is proportional to value. We don't want a linear relationship, since it is the AREA that must be proportional to the value
# plot(data$radius, data$value)
# The next step is to go from one center + a radius to the coordinates of a circle that
# is drawn by a multitude of straight lines.
dat.gg <- circleLayoutVertices(packing, npoints=50) |>
mutate(category_color = weekdata_p$category_color[id]) |>
mutate(text_color = weekdata_p$text_color[id])
# Make the plot
bubbleplot <- ggplot() +
# Make the bubbles
geom_polygon(data = dat.gg, aes(x, y, group = id, fill=as.factor(id)),
colour = "black",
fill = dat.gg$category_color,
alpha=0.7) +
# Add text in the center of each bubble + control its size
geom_text(data = weekdata_p, aes(x, y, size=value, label = group), color="black") + #, color=weekdata_p$text_color
#geom_text(data = weekdata_p, aes(x, y, size=value + 0.07, label = group), color="black") +
scale_size_continuous(range = range_b) +
coord_equal()
# General theme:
bubbleplot <- bubbleplot +
theme_void() +
theme(legend.position="none") +
theme(panel.border = element_blank())
# add titles
bubbleplot <- bubbleplot +
labs(
title = "A typical 7 days for sex crimes against children",
subtitle = numcrimesweektext,
)
# make theme prettier
bubbleplot <- bubbleplot +
theme(
plot.title = element_text( # font size "large"
size = 20,
hjust = 0, vjust = 1,
margin = margin(b = 15/2)
),
plot.subtitle = element_text( # font size "regular"
size = 15,
hjust = 0, vjust = 1,
margin = margin(b = 15/2)
)
)
# show the figure
bubbleplot
})
############ bubble plot hover text ##############
output$bubbleplot_info <- renderUI({
if(length(str_subset(input$includebubble, "include_notlisted")) > 0) {
big_category_counts_summary_w <- big_category_counts_bubble_w |>
distinct(Category, .keep_all = TRUE)
} else {
big_category_counts_summary_w <- big_category_counts_bubble_w |>
distinct(Category, .keep_all = TRUE) |>
#big_category_counts_summary <- big_category_counts_summary |>
filter(Category != "Not Listed")
}
# Create data, group by big categories, sort by size and alpha (in case of size ties)
weekdata <- data.frame(group=paste(big_category_counts_summary_w$Category_n, big_category_counts_summary_w$Category,sep="\n"),
value=big_category_counts_summary_w$Category_n,
category_color=big_category_counts_summary_w$category_color,
text_color=big_category_counts_summary_w$text_color,
hovertextinfo=big_category_counts_summary_w$Category) |>
arrange(desc(big_category_counts_summary_w$Category_n), big_category_counts_summary_w$Category)
# Generate the layout. This function return a dataframe with one line per bubble.
# It gives its center (x and y) and its radius, proportional of the value
packing <- circleProgressiveLayout(weekdata$value, sizetype='area')
# We can add these packing information to the initial data frame
weekdata_p <- cbind(weekdata, packing)
# capture location of mouse hovering after it pauses
hover_bubble <- input$bubbleplot_hover
if(!is.null(hover_bubble)){ # mouse is in bounds
left_px_bubble <- hover_bubble$x # for chart coordinates
top_px_bubble <- hover_bubble$y
left_css_px_bubble <- hover_bubble$coords_css$x # for position of tooltip
top_css_px_bubble <- hover_bubble$coords_css$y
# if mouse is on left of graph, put tool tips to right. if on right put on left
style <- paste0("position:absolute; z-index:100; background-color: rgba(245, 245, 245, 0.85); ",
"left:", left_css_px_bubble, "px; top:", top_css_px_bubble-700, "px;")
groupval <- NULL
for(i in 1:length(weekdata_p$radius)){
if(sqrt((left_px_bubble - weekdata_p$x[i])^2 + (top_px_bubble - weekdata_p$y[i])^2) < weekdata_p$radius[i]){
groupval <- weekdata_p$hovertextinfo[i]
}
}
# construct names hover info
if(!is.null(groupval)){
hovertext_bubble <- big_category_counts_bubble |>
filter(Category == groupval) |>
ungroup() |>
select(Relation) |>
distinct() |>
sample_n(size=10)
# remove entries with numbers, gets weird.
hovertext_bubble <- hovertext_bubble[!str_detect(hovertext_bubble$Relation, '[0-9]'),]
hovertext_str <- NULL
if(dim(hovertext_bubble)[1] == 1){
hovertextformatted <- paste("<b>People listed in ",groupval," only include:</b><br />★ ", hovertext_bubble$Relation[1], sep="")
} else if(dim(hovertext_bubble)[1] < 6) {
for(i in seq(1, dim(hovertext_bubble)[1])){
#for(i in seq(1, 10)){
hovertext_sub <- paste("★ ",hovertext_bubble$Relation[i],"<br />", sep="")
hovertext_str <- paste(hovertext_str, hovertext_sub, sep="")
}
hovertextformatted <- paste("<b>People listed in ",groupval," included:</b>", hovertext_str, sep="<br />")
} else if(dim(hovertext_bubble)[1] >= 6) {
for(i in seq(1, dim(hovertext_bubble)[1], 2)){
#for(i in seq(1, 10, 2)){
if(i + 1 > dim(hovertext_bubble)[1]){
#if(i + 1 > 10){
hovertext_sub <- paste("★ ",hovertext_bubble$Relation[i], sep="")
hovertext_str <- paste(hovertext_str, hovertext_sub, sep="")
} else {
hoversub1 <- hovertext_bubble$Relation[i]
hoversub1 <- paste("★ ",hoversub1, ", ", sep="")
hoversub2 <- hovertext_bubble$Relation[i+1]
hoversub2 <- paste("★ ",hoversub2,",<br />", sep="")
hovertext_str <- paste(hovertext_str, hoversub1 , hoversub2, sep="")
}
}
#hovertext_str <- hovertext_sub
#hovertext_str <- str_replace_all(hovertext_str, ";","<br />* ")
hovertextformatted <- paste("<b>People listed in ",groupval," included:</b>", hovertext_str, sep="<br />")
}
crimes_on_date_text <- hovertextformatted
wellPanel(
style = style,
p(HTML(crimes_on_date_text))
)
}
}
})
######################## About - cumulative sum of data ###############################
# fill in zeros for days with no crimes, create a df ready for time series
eventsperday <- wmn_data_previous |>
group_by(Date) |>
summarize(perday = n()) |>
complete(Date = seq.Date(min(Date), max(Date), by = "days"),
fill = list(perday = 0)) |>
mutate(cum_sum = cumsum(perday)) |>
arrange(Date)
##############
# Linear model
##############
# get the last month's data, count, and make a linear regression
sumduration = 30
numEventsperday_reg <- eventsperday |>
filter(Date > ymd("2023-07-14"))
#filter(Date > max(Date) - duration(sumduration, 'days'))
#textlabel <- "Linear model"
cumsum30.lm = lm(cum_sum ~ Date, data=numEventsperday_reg)
# p-value
cumsum_p <- sorted_p(cumsum30.lm)
cumsum_ar <- summary(cumsum30.lm)$adj.r.squared
# slope, intercept
cumsum_m <- cumsum30.lm$coefficients[2]
cumsum_b <- cumsum30.lm$coefficients[1]
cumsumlineartext <- "Above, predicted crimes based on linear regression since July 14th, 2023, when sampling appeared to change. The dark red dotted line represents the regression, with"
cumsumlineartext <- paste(cumsumlineartext, " slope = ", round(cumsum_m,2), " crimes per day, and r<sup>2</sup> = ", round(cumsum_ar,4), " (no hover data on this plot).<br /><br />Interestingly, it seems that the crime rate has held steady for a lengthy time period - is there just a 'background' rate of crimes against children that occurs? It would take multiple years to determine if the 'wobble' in the line is due to seasonal effects on crime rates, but that would be interesting!", sep='')
cumsumlineartext <- paste(cumsumlineartext, "<br /><br />Below, the errors of the linear prediction, per day (hover for actual and predicted values).")
nextcrimetext <- paste("Number of crimes predicted for ",strftime(now(), "%A, %B %d, %Y")," is: ",round(cumsum_m, 0), sep="")
# get limits for error line plots
ylim_low <- 999
ylim_high <- -999
# get errors for line plot
linear_error <- all_errors_longer |>
filter(model == "lin_diffcrimes")
linear_over_predict <- linear_error |>
filter(value > 0)
if(max(linear_over_predict$value) > ylim_high){ylim_high <- max(linear_over_predict$value)}
linear_under_predict <- linear_error |>
filter(value < 0)
if(min(linear_under_predict$value) < ylim_low){ylim_low <- min(linear_under_predict$value)}
linear_well_predict <- linear_error |>