Skip to content

Commit

Permalink
Merge pull request #301 from PSLmodels/pr-examine-tmd-vs-CD-US-totals
Browse files Browse the repository at this point in the history
PR examine tmd vs cd us totals
  • Loading branch information
donboyd5 authored Nov 18, 2024
2 parents 2db8e5f + 5b386ae commit f1c7c21
Show file tree
Hide file tree
Showing 19 changed files with 456 additions and 1,273 deletions.
36 changes: 0 additions & 36 deletions tmd/areas/targets/ny21_targets.csv
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
varname,count,scope,agilo,agihi,fstatus,target
XTOT,0,0,-9e99,9e99,0,778806.9963618957
c00100,1,1,-9e99,1,0,7783.30061591544
c00100,1,1,1,10000,0,34845.00467650245
c00100,1,1,10000,25000,0,58872.665215289446
c00100,1,1,25000,50000,0,92073.22923991714
c00100,1,1,50000,75000,0,57122.387347592645
c00100,1,1,75000,100000,0,36860.296870916754
c00100,1,1,100000,200000,0,55175.53757452468
c00100,1,1,200000,500000,0,12676.61287259154
c00100,1,1,500000,9e99,0,2380.3685403844206
c00100,0,1,1,10000,0,182423342.11336407
c00100,0,1,10000,25000,0,1037335916.1426524
c00100,0,1,25000,50000,0,3369386206.2984276
Expand All @@ -17,33 +8,6 @@ c00100,0,1,75000,100000,0,3192887737.1752
c00100,0,1,100000,200000,0,7444391217.009996
c00100,0,1,200000,500000,0,3533125242.646242
c00100,0,1,500000,9e99,0,3512409391.634545
c00100,1,1,-9e99,1,1,5025.5683979292535
c00100,1,1,1,10000,1,28500.37375066762
c00100,1,1,10000,25000,1,40999.78237965522
c00100,1,1,25000,50000,1,55431.39047078707
c00100,1,1,50000,75000,1,27828.250423235033
c00100,1,1,75000,100000,1,11313.918480070117
c00100,1,1,100000,200000,1,8421.1356665729
c00100,1,1,200000,500000,1,1666.754799870258
c00100,1,1,500000,9e99,1,375.3116805477707
c00100,1,1,-9e99,1,2,1474.965149593628
c00100,1,1,1,10000,2,3227.4938250458044
c00100,1,1,10000,25000,2,7784.831857302671
c00100,1,1,25000,50000,2,17620.627308577044
c00100,1,1,50000,75000,2,20112.27469996074
c00100,1,1,75000,100000,2,21629.717785468103
c00100,1,1,100000,200000,2,43900.36435388441
c00100,1,1,200000,500000,2,10580.306783344742
c00100,1,1,500000,9e99,2,1903.525763605093
c00100,1,1,-9e99,1,4,1041.8053348183391
c00100,1,1,1,10000,4,2545.6814968580734
c00100,1,1,10000,25000,4,8850.43355733606
c00100,1,1,25000,50000,4,16010.696051542369
c00100,1,1,50000,75000,4,6907.72695791437
c00100,1,1,75000,100000,4,2875.8964103872336
c00100,1,1,100000,200000,4,1963.7019740835344
c00100,1,1,200000,500000,4,297.62913704836967
c00100,1,1,500000,9e99,4,61.76571232759558
e00200,0,1,-9e99,1,0,28535896.06329254
e00200,0,1,1,10000,0,132417077.07366145
e00200,0,1,10000,25000,0,620330137.749753
Expand Down
5 changes: 3 additions & 2 deletions tmd/areas/targets/prepare/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,10 @@ book:
- cd_construct_long_soi_data_file.qmd
- cd_create_basefile_for_117Congress_cd_target_files.qmd
- cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd
- cd_enhance_basefile_with_special_targets.qmd
- cd_create_basefile_multiple_sessions.qmd
- cd_create_variable_mapping.qmd
# - cd_map_tcvars_and_extract_target_files.qmd
- cd_compare_us_totals_tmd_vs_irs_published.qmd
- cd_enhance_basefile_with_special_targets.qmd
appendices:
- cd_issues_and_TODOs.qmd
- cd_IRS_documentation.qmd
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
---
output: html_document
editor_options:
chunk_output_type: console
---

# Compare U.S. totals of mapped variables, tax-microdata-benchmarking vs. IRS published CD values


## Setup
```{r}
#| label: setup
source(here::here("R", "libraries.R"))
source(here::here("R", "constants.R"))
source(here::here("R", "functions.R"))
```

## Get data

```{r}
#| label: get-data
#| output: false
# Variable mapping (has basevname and varname columns) and the CD base
# file; both are read from CDINTERMEDIATE, which is defined outside this
# chunk.
vmap <- fs::path(CDINTERMEDIATE, "cd_variable_mapping.csv") |>
  read_csv()
cdirs <- fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv") |>
  read_csv()

# tmd microdata: cached_allvars.csv under storage/output, three levels up
# from the directory that here::here() resolves to.
TMDDIR <- here::here("..", "..", "..", "storage", "output")
fpath <- fs::path(TMDDIR, "cached_allvars.csv")
tmd2021 <- vroom(fpath)

# ns() is a project helper not shown in this file -- presumably a quick
# listing of column names; confirm in R/functions.R.
tmd2021 |> ns()
```

## Create comparison file

Prepare Congressional district data.

```{r}
#| label: prepare-cd-data
#| output: false
# Quick look at the count codes present in the full CD file.
cdirs |> count(count)

# Keep rows for mapped variables in session 118. Only scope 1 is kept,
# except that XTOT rows are retained whatever their scope.
cd2 <- cdirs |>
  filter(
    basevname %in% vmap$basevname,
    session == 118,
    scope == 1 | basevname == "XTOT"
  )

# Interactive checks on the filtered file.
cd2 |> glimpse()
cd2 |> count(count)
cd2 |> count(basevname, vname)
cd2 |> count(rectype)
cd2 |> count(scope)
cd2 |> count(fstatus)
cd2 |>
  filter(str_detect(vname, "MARS")) |>
  count(vname, fstatus)
cd2 |> skim()

# Add up target within each variable / scope / count / filing-status /
# AGI-range cell; every column not listed in .by is collapsed away.
cd_adjusted <- cd2 |>
  summarize(
    target = sum(target),
    .by = c(
      basevname, vname, scope, count, fstatus,
      agistub, agirange, description
    )
  )
```

Prepare tax-microdata-benchmarking (tmd) data.

```{r}
#| label: prepare-tmd-data
#| eval: true
#| output: false
# AGI stub codes (stub 0, the total, is not produced here; it is added
# further down when the ranges are summed):
#   0 = Total
#   1 = Under $1
#   2 = $1 under $10,000
#   3 = $10,000 under $25,000
#   4 = $25,000 under $50,000
#   5 = $50,000 under $75,000
#   6 = $75,000 under $100,000
#   7 = $100,000 under $200,000
#   8 = $200,000 under $500,000
#   9 = $500,000 or more
agicuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf)

# For MARS rows of the mapping, sum the MARS1/MARS2/MARS4 indicator
# columns created below (named by basevname); otherwise sum the column
# named by varname.
variables_to_sum <- ifelse(
  str_starts(vmap$basevname, "MARS"),
  vmap$basevname,
  vmap$varname
)

tmd2 <- tmd2021 |>
  mutate(
    # scope is 2 where data_source == 0, otherwise 1
    scope = ifelse(data_source == 0, 2, 1),
    MARS1 = MARS == 1,
    MARS2 = MARS == 2,
    MARS4 = MARS == 4,
    # right = FALSE gives left-closed intervals; the integer code of the
    # resulting factor is the stub number 1-9
    agistub = as.integer(
      cut(c00100, agicuts, right = FALSE, ordered_result = TRUE)
    )
  ) |>
  summarise(
    across(
      all_of(variables_to_sum),
      list(
        # s006 is used as the record weight in all three statistics
        amount = function(x) sum(x * s006),
        nzcount = function(x) sum((x != 0) * s006),
        allcount = function(x) sum(s006)
      )
    ),
    .by = c(scope, agistub)
  ) |>
  arrange(scope, agistub)
# Eyeball each family of summary columns: nonzero counts, all-record
# counts, then amounts.
select(tmd2, scope, agistub, contains("nzcount"))
select(tmd2, scope, agistub, contains("allcount"))
select(tmd2, scope, agistub, contains("amount"))

# c00100 count columns for scope 1, with a totals row appended.
tmd2 |>
  filter(scope == 1) |>
  select(
    scope,
    agistub,
    contains("c00100") & contains("count")
  ) |>
  janitor::adorn_totals()
# Reshape to one row per scope / agistub / variable / statistic type.
# The summary columns are named <variable>_<type>, so splitting on "_"
# recovers both pieces (this relies on variable names having no "_").
# Statistic types are then translated to count codes:
#   amount -> 0, allcount -> 1, nzcount -> 2; anything else -> -9e9.
tmd3 <- tmd2 |>
  pivot_longer(
    cols = -c(scope, agistub),
    values_to = "wtdvalue"
  ) |>
  separate_wider_delim(
    cols = name,
    delim = "_",
    names = c("varname", "type")
  ) |>
  mutate(
    count = case_when(
      type == "amount" ~ 0,
      type == "allcount" ~ 1,
      type == "nzcount" ~ 2,
      .default = -9e9
    )
  )

# Confirm each type maps to exactly one count code.
tmd3 |> count(count, type)
tmd3 |> filter(count == 2, agistub == 1) # looks good
tmd3 |> filter(count == 1, agistub == 4)
# 37,694,755 is the bad val allcount
# Handle the MARS and non-MARS variables separately so the MARS values can
# be given proper filing-status codes. This is the non-MARS part:
# every non-MARS row gets fstatus 0, and for XTOT only the "amount" rows
# are kept.
tmdxmars <- tmd3 |>
  filter(
    !str_detect(varname, "MARS"),
    !(varname == "XTOT" & type != "amount")
  ) |>
  mutate(fstatus = 0) |>
  select(varname, scope, fstatus, count, agistub, wtdvalue) |>
  arrange(varname, scope, fstatus, count, agistub)
# tmdxmars |> filter(varname=="e00200", agistub==4) # we'll want count==2 for counts -- nonzero counts
# Check the tmd MARS totals by variable, scope and statistic type.
tmd3 |>
  filter(str_detect(varname, "MARS")) |>
  summarize(
    wtdvalue = sum(wtdvalue),
    .by = c(varname, scope, type, count)
  )
# The check above helps verify that type == "amount" (sum of the MARS
# indicator times the weight) is the number of returns; the other
# statistic types are not useful for MARS variables.

# CAUTION: to be consistent with the xxxx_targets.csv file format rules,
# count must be 1 (number of units with any value) INSTEAD of the 0 that
# the "amount" rows carry from tmd3, so count is overwritten explicitly
# below. Without this the filing-status rows keep count 0.
# By convention c00100 is used as the variable name, but any variable
# would give the same result.
# NOTE(review): confirm that the CD base file codes its MARS rows with
# count 1, since the later comparison join is keyed on count.
tmdmars <- tmd3 |>
  filter(
    str_detect(varname, "MARS"),
    type == "amount"
  ) |>
  mutate(
    # filing status is the trailing digit of MARS1 / MARS2 / MARS4;
    # computed before varname is overwritten on the next line
    fstatus = str_sub(varname, -1) |>
      as.integer(),
    varname = "c00100",
    count = 1
  ) |>
  select(varname, scope, fstatus, count, agistub, wtdvalue) |>
  arrange(varname, scope, fstatus, count, agistub)
# Stack the non-MARS and MARS pieces.
tmdsums1 <- bind_rows(tmdxmars, tmdmars)

# Prepend all-AGI-range totals, coded as agistub 0, to the by-range rows.
tmd_agitots <- bind_rows(
  tmdsums1 |>
    summarize(
      wtdvalue = sum(wtdvalue),
      .by = c(varname, scope, fstatus, count)
    ) |>
    mutate(agistub = 0),
  tmdsums1
)

# Prepend all-scope totals, coded as scope 0, then fix column and row
# order.
tmd_scopes <- bind_rows(
  tmd_agitots |>
    summarize(
      wtdvalue = sum(wtdvalue),
      .by = c(varname, agistub, fstatus, count)
    ) |>
    mutate(scope = 0),
  tmd_agitots
) |>
  select(varname, scope, fstatus, count, agistub, wtdvalue) |>
  arrange(varname, scope, fstatus, count, agistub)
# The mapping needs an fstatus column for proper merging: the MARS1 /
# MARS2 / MARS4 rows get filing status 1 / 2 / 4, every other row gets 0.
vmap2 <- vmap |>
  mutate(
    fstatus = as.integer(
      case_when(
        basevname == "MARS1" ~ 1,
        basevname == "MARS2" ~ 2,
        basevname == "MARS4" ~ 4,
        .default = 0
      )
    )
  )

# Attach the mapping columns to the tmd sums; tmd rows with no mapping
# match are kept (left join).
tmd_adjusted <- left_join(
  tmd_scopes,
  vmap2,
  by = join_by(varname, fstatus)
)
```


## Prepare comparison file

```{r}
#| label: prepare-compare
#| eval: true
#| output: false
# Match tmd sums to CD targets cell by cell; only cells present on both
# sides survive the inner join. description is dropped from the tmd side
# first so it does not collide with the description column of cd_adjusted.
comp <- tmd_adjusted |>
  select(-description) |>
  inner_join(
    cd_adjusted,
    by = join_by(scope, fstatus, count, agistub, basevname)
  ) |>
  relocate(wtdvalue, .after = target) |>
  # diff: tmd minus target; pdiff: that difference as a share of target
  mutate(diff = wtdvalue - target) |>
  mutate(pdiff = diff / target)

comp |> summary()
comp |> skim()

# Save the comparison file to CDINTERMEDIATE.
comp |>
  write_csv(fs::path(CDINTERMEDIATE, "cd_tmd_irs_compare.csv"))
```

## Explore comparisons file

```{r}
#| label: explore-compare
#| eval: true
#| output: false
# Re-read the saved comparison file so this chunk can be run on its own.
comp <- fs::path(CDINTERMEDIATE, "cd_tmd_irs_compare.csv") |>
  read_csv()

# Largest percentage differences first.
comp |>
  arrange(desc(abs(pdiff)))

# variables where tmd and IRS concepts are not well aligned
badmatches <- c("e18400", "e18500", "e02400")
# also set e26270 aside, to make it easier to examine other variables
badmatches <- c(badmatches, "e26270")

# Nonzero counts (count == 2), worst percentage differences first.
check <- comp |>
  filter(count == 2, !varname %in% badmatches) |>
  arrange(desc(abs(pdiff)))

# Weighted number of data_source == 1 records with nonzero c00100.
tmd2021 |>
  filter(data_source == 1, c00100 != 0) |>
  summarise(n = sum(s006)) # 160850840

temp2 <- tmd_adjusted |>
  filter(
    fstatus == 0,
    agistub == 0,
    varname == "c00100",
    scope == 1
  )
temp2 |>
  gt() |>
  fmt_number(wtdvalue, decimals = 0)
# count 1 161,696,687
# count 2 160,850,840

temp <- comp |>
  filter(
    varname == "c00100",
    scope == 1,
    agistub == 0,
    count == 2
  )
# target 157375370 value 160850840

# All count codes, worst percentage differences first.
comp |>
  filter(!varname %in% badmatches) |>
  arrange(desc(abs(pdiff)))

# count == 1 rows, largest tmd values first; then the same for agistub 4.
comp |>
  filter(count == 1, !varname %in% badmatches) |>
  arrange(desc(wtdvalue))
comp |>
  filter(count == 1, agistub == 4, !varname %in% badmatches) |>
  arrange(desc(wtdvalue))

# Amounts (count == 0), worst percentage differences first.
check <- comp |>
  filter(count == 0, !varname %in% badmatches) |>
  arrange(desc(abs(pdiff)))
check |> filter(agistub == 0)
check |> filter(agistub == 9)

# Amount cells that are off by 30 percent or more.
verybad <- check |>
  filter(abs(pdiff) >= .3)
verybad
verybad |> filter(agistub == 0)

# Lessons:
# - e26270 Partnership / S Corp looks like it could be a conceptual mismatch?? Or some other problem in concept?
# agistub 0 is within 1.5% but ranges are way off
# should we create a shared-down variable?
# - e00300 taxable interest seems a little off
```

Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ This involves cleaning the SOI Congressional District data, adding agi bin infor
source(here::here("R", "libraries.R"))
source(here::here("R", "constants.R"))
source(here::here("R", "functions.R"))
# 334283385.27000004 national pop
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ This section creates one long file that is a superset of what we need for indivi
source(here::here("R", "libraries.R"))
source(here::here("R", "constants.R"))
source(here::here("R", "functions.R"))
# 334283385.27000004 national pop
Expand Down Expand Up @@ -123,7 +124,7 @@ cdlong4 <- cdlong3 |>
.default = 0),
count = case_when(
vtype == "count" ~ 1,
vtype == "count" ~ 2, # correction - used to be 1
vtype == "amount" ~ 0,
.default = -99),
Expand Down
Loading

0 comments on commit f1c7c21

Please sign in to comment.