Skip to content

Commit

Permalink
Merge pull request #304 from PSLmodels/pr-initial-setup-prepare-state…
Browse files Browse the repository at this point in the history
…-targets

Pr initial setup prepare state targets
  • Loading branch information
donboyd5 authored Nov 20, 2024
2 parents 7003dd9 + 5ace6db commit db3e810
Show file tree
Hide file tree
Showing 17 changed files with 3,910 additions and 0 deletions.
1 change: 1 addition & 0 deletions tmd/areas/targets/prepare/prepare_states/.Rprofile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Run renv's activation script from this project-level .Rprofile.
# NOTE(review): presumably this switches the session to the project's
# renv-managed package library -- confirm against renv/activate.R.
source("renv/activate.R")
37 changes: 37 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# prepare_states

# ignore quarto at the root level
/.quarto/

# folders to ignore (anywhere in project since not preceded by root /)
.Rproj.user/
#_docs/
#_targetprep/
#_freeze/
_web/
#libs/
# Local Netlify folder
.netlify

# Ignore `renv` directories that are system-specific
renv/library/
renv/cache/

# Track the lockfile and settings file for reproducibility
!renv.lock
!renv/settings.dcf

# file types to ignore (unless re-included by a negation rule elsewhere in this file)
~*
*.csv
*.html
*.rds

# specific files to ignore regardless of how file types are treated
.Rhistory

# Do not ignore anything in raw data folder, including files or folders nested within
!states/raw_data/**
# but continue to ignore Word temp files
states/raw_data/~*

17 changes: 17 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/R/constants.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

# IRS SOI Congressional District downloads: data zip, documentation guide,
# and the name of the workbook holding documentation extracted from the guide.
CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip"
CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx"
CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx"

# Data folders, all located under "cds" in the project root found by here::here().
CDDIR <- here::here("cds")
CDRAW <- fs::path(CDDIR, "raw_data")
CDINTERMEDIATE <- fs::path(CDDIR, "intermediate")
CDFINAL <- fs::path(CDDIR, "final")

# Five directory levels above the project root.
# NOTE(review): presumably the repository root that contains "tmd" -- confirm
# with normalizePath(TMDHOME).
TMDHOME <- fs::path(here::here(), "..", "..", "..", "..", "..")
# Folder <TMDHOME>/tmd/storage/output; check with normalizePath(TMDDATA).
TMDDATA <- fs::path(TMDHOME, "tmd", "storage", "output")

# Break points for income ranges: open-ended at both ends, with interior cuts
# at 1 and at 10, 25, 50, 75, 100, 200, and 500 thousand.
# NOTE(review): presumably AGI in dollars, judging by the name -- confirm.
CDAGICUTS <- c(-Inf, 1, c(10, 25, 50, 75, 100, 200, 500) * 1e3, Inf)
11 changes: 11 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/R/functions.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@


# Print the first and then the last `nrecs` elements (rows, for a data frame)
# of `df`.
#
# df:    object accepted by utils::head() and utils::tail().
# nrecs: number of elements to show from each end (default 6).
#
# Returns the tail portion invisibly, because print() returns its argument
# invisibly and it is the last call.
ht <- function(df, nrecs = 6) {
  print(utils::head(df, n = nrecs))
  bottom <- utils::tail(df, n = nrecs)
  print(bottom)
}

# Return the names of `obj` in sorted order.
ns <- function(obj) {
  obj |>
    names() |>
    sort()
}

82 changes: 82 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/R/libraries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# libraries ---------------------------------------------------------------
# Attaches the packages used by the project and sets tibble/tigris options.
# Packages attached later can mask functions from earlier ones, so the order
# of the library() calls below is left as is.

library(renv)
library(here)

library(DT)
library(fs)
library(gt)
library(knitr)
library(readxl)
library(skimr)
library(stringr)
library(tidyverse)
# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr

tprint <- 75 # default tibble print
options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows

library(vroom)

# One-time Census API key setup, run interactively; the key is then stored in .Renviron.
# Substitute your own key -- never commit a real key to the repository:
# census_api_key("YOUR_CENSUS_API_KEY", install = TRUE)
# NOTE(review): a literal key was previously committed in this comment and
# remains in the git history; it should be revoked and replaced.
# libraries needed for census population
library(sf)
library(tidycensus)
library(tigris)
options(tigris_use_cache = TRUE)


# possible libraries ------------------------------------------------------

# library(rlang)
# library(tidyverse)
# tprint <- 75 # default tibble print
# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows
#
# library(fs)

# tools
# library(vroom)
# library(readxl)
# library(openxlsx) # for writing xlsx files
# library(lubridate)
# library(RColorBrewer)
# library(RcppRoll)
# library(fredr)
# library(tidycensus)
# library(googledrive)
# library(arrow)
#
# library(jsonlite)
# library(tidyjson)
#
#
# # boyd libraries
# # library(btools)
# # library(bdata)
# # library(bggtools)
# # library(bmaps)
#
# # graphics
# library(scales)
# library(ggbeeswarm)
# library(patchwork)
# library(gridExtra)
# library(ggrepel)
# library(ggbreak)
#
# # tables
# library(knitr)
# library(kableExtra)
# library(DT)
# library(gt)
# library(gtExtras)
# library(janitor)
# library(skimr)
# library(vtable)
#
# # maps
# library(maps)
# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html
# library(usmap)

78 changes: 78 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/_quarto.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
project:
type: book
output-dir: _web

# https://prerelease.quarto.org/ # quarto documentation at this link


# site info:
# OLD id: 4d646266-9d1f-4d69-acb4-b9a17b63a5ff
# Unique deploy URL: https://671e13320a7e7cfb68b1ba7d--tmd-areas-prepare-targets.netlify.app
# url: https://tmd-areas-targets-prepare.netlify.app

# publishing with netlify cli:
# open terminal in prepare_states
# quarto render && netlify deploy --prod --dir=_web

# quarto render # inspect to be sure it is as desired
# netlify deploy --prod --dir=_web

# or step by step
# netlify deploy # to test it, give _examine as publish directory
# netlify deploy --prod # to deploy, give _docs as publish directory

execute:
eval: true
echo: true
output: true
freeze: false # auto: during global project renders, re-render only when source changes

book:
title: "Develop targets for States"
subtitle: "Create csv target files for use by area targeting routines"
# author: "Don Boyd"
date: today
date-format: long
chapters:
- index.qmd
- part: "Usage"
chapters:
- usage.qmd
- part: "IRS SOI State data"
chapters:
# - cd_download_and_clean_census_population_data.qmd
- download_soi_data.qmd
- explore_soi_data.qmd
# - cd_construct_soi_variable_documentation.qmd
# - cd_construct_long_soi_data_file.qmd
# - cd_create_basefile_for_117Congress_cd_target_files.qmd
# - cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd
# - cd_create_basefile_multiple_sessions.qmd
# - cd_create_variable_mapping.qmd
# - cd_compare_us_totals_tmd_vs_irs_published.qmd
# - cd_enhance_basefile_with_special_targets.qmd
# appendices:
# - cd_issues_and_TODOs.qmd
# - cd_IRS_documentation.qmd

format:
html:
theme: cosmo
code-fold: true

editor_options:
chunk_output_type: console

# rendering commands
# quarto render
# quarto publish netlify --no-prompt --no-render --no-browser

# possibly use this at start of each doc
# ---
# output: html_document
# editor_options:
# chunk_output_type: console
# ---



Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
---
output: html_document
editor_options:
chunk_output_type: console
---

# Get Congressional District Census population data

## Setup

```{r}
#| label: setup
# Source the shared scripts from the project's R folder, in this order:
# packages, then constants, then helper functions.
invisible(lapply(
  c("libraries.R", "constants.R", "functions.R"),
  \(script) source(here::here("R", script))
))
# 334283385.27000004 national pop
```

## Get Congressional District population

The quarto chunk `cdpop-download` will, if a user sets the chunk `eval` option to `eval: true`, download and save Congressional District population data, based on the 116th Congress, from the American Community Survey (ACS) for 2021.

Ordinarily this will not be necessary because previously downloaded files are included with the project in the "../cds/raw_data" folder. Thus, the default chunk option is `eval: false` and the chunk will not be run when this project is rendered.

```{r}
#| label: cdpop-download
#| eval: false
# Note that 2021 gets data from the 116th Congress and 2022 gets the 118th Congress
cdpop1year <- get_acs(
  survey = "acs1", # Consider using 5-year estimates for better coverage, especially in smaller areas
  year = 2021,
  geography = "congressional district",
  variables = "B01003_001" # Total population variable
)
# Save the raw download for the cleaning chunk to read.
cdpop1year |>
  write_csv(fs::path(CDRAW, "cdpop1year_acs.csv"))
# Optionally get 5-year ACS data - possibly useful in the future.
# cdpop5year <- get_acs(
# geography = "congressional district",
# variables = "B01003_001", # Total population variable
# year = 2021,
# survey = "acs5" # Use 5-year estimates for better coverage, especially in smaller areas
# )
# write_csv(cdpop5year, fs::path(CDRAW, "cdpop5year_acs.csv"))
```

## Clean Congressional District population

This chunk gets previously saved Congressional District population data, does minor cleaning, and saves the cleaned file to the "../cds/intermediate" folder.

```{r}
#| label: cdpop-clean
#| output: false
# Read the saved 1-year ACS district populations. GEOID is read explicitly as
# character: its first two characters are the state FIPS code and the next two
# the district number, so leading zeros must not be lost to type guessing.
cdpop1year <- read_csv(
  fs::path(CDRAW, "cdpop1year_acs.csv"),
  col_types = cols(GEOID = col_character())
)
cdpop1year |> summarise(estimate = sum(estimate)) # 335157329
# cdpop5year <- read_csv(fs::path(CDRAW, "cdpop5year_acs.csv"))
# cdpop5year |> summarise(estimate=sum(estimate)) # 333036755

# Lookup from state FIPS code to state postal abbreviation.
stcodes <- tigris::states() |>
  as.data.frame() |>
  select(STATEFIPS = STATEFP, STATE = STUSPS)

# Split GEOID into state and district codes, attach the state abbreviation,
# drop Puerto Rico, and keep only the columns needed downstream.
cdpop1 <- cdpop1year |>
  mutate(
    STATEFIPS = str_sub(GEOID, 1, 2),
    CONG_DISTRICT = str_sub(GEOID, 3, 4)
  ) |>
  left_join(stcodes, by = join_by(STATEFIPS)) |>
  # we're not using Puerto Rico; note that filter() also drops any row whose
  # STATE is NA, i.e. any FIPS code that found no match in stcodes
  filter(STATE != "PR") |>
  mutate(CONG_DISTRICT = ifelse(STATE == "DC",
    "00", # Census data has 98 for DC
    CONG_DISTRICT
  )) |>
  select(STATEFIPS, STATE, CONG_DISTRICT, cdname = NAME, pop2021 = estimate)

# Diagnostics (not shown in the rendered document because output is false).
count(cdpop1, STATEFIPS, STATE)
cdpop1 |> filter(STATE == "NY")
cdpop1 |> filter(STATE == "AK")
cdpop1 |> filter(STATE == "DC")
cdpop1 |> filter(STATE == "WY")
count(cdpop1, CONG_DISTRICT)
glimpse(cdpop1)
sum(cdpop1$pop2021) # 331,893,745 compared to Martin's 334,283,385

# Save the cleaned file for later steps.
write_csv(cdpop1, fs::path(CDINTERMEDIATE, "cdpop1year.csv"))
```
5 changes: 5 additions & 0 deletions tmd/areas/targets/prepare/prepare_states/data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Ignore everything in this directory
*

# Allow the .gitignore file itself
!.gitignore
Loading

0 comments on commit db3e810

Please sign in to comment.