From 863f82005f6e43ad56850b33fc0024337e1dfcd1 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:39:57 -0500 Subject: [PATCH 01/33] update log files --- tmd/areas/weights/ny21.log | 61 ++++++++++++++++++++++++ tmd/areas/weights/xx.log | 96 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 tmd/areas/weights/ny21.log create mode 100644 tmd/areas/weights/xx.log diff --git a/tmd/areas/weights/ny21.log b/tmd/areas/weights/ny21.log new file mode 100644 index 00000000..c113cbe2 --- /dev/null +++ b/tmd/areas/weights/ny21.log @@ -0,0 +1,61 @@ +CREATING WEIGHTS FILE FOR AREA ny21 ... +INITIAL WEIGHTS STATISTICS: +sum of national weights = 1.840247e+08 +area weights_scale = 2.329781e-03 +USING ny21_targets.csv FILE WITH 82 TARGETS +ASSUMING TARGET_RATIO_TOLERANCE = 0.004000 +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=82): +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.400000, < 0.800000: 4 4 4.88% 4.88% +>= 0.800000, < 0.900000: 3 7 3.66% 8.54% +>= 0.900000, < 0.990000: 12 19 14.63% 23.17% +>= 0.990000, < 0.996000: 0 19 0.00% 23.17% +>= 0.996000, < 1.004000: 2 21 2.44% 25.61% +>= 1.004000, < 1.010000: 1 22 1.22% 26.83% +>= 1.010000, < 1.100000: 9 31 10.98% 37.80% +>= 1.100000, < 1.200000: 9 40 10.98% 48.78% +>= 1.200000, < 1.600000: 14 54 17.07% 65.85% +>= 1.600000, < 2.000000: 12 66 14.63% 80.49% +>= 2.000000, < 3.000000: 14 80 17.07% 97.56% +>= 3.000000, < 4.000000: 2 82 2.44% 100.00% +MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.669 +MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 3.083 +US_PROPORTIONALLY_SCALED_TARGET_RMSE= 7.375708784e-01 +target_matrix sparsity ratio = 0.929 +OPTIMIZE WEIGHT RATIOS POSSIBLY IN A REGULARIZATION LOOP + where initial REGULARIZATION DELTA value is 1.000000e-09 + and there is only one REGULARIZATION LOOP + and where target_matrix.shape= (225256, 82) + ::loop,delta,misses: 1 1.000000e-09 0 +>>> final delta loop iterations=407 success=True +>>> message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH +>>> L-BFGS-B optimized objective function value: 3.814234922e-05 +AREA-OPTIMIZED_TARGET_MISSES= 0 +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=82): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.996000, < 1.004000: 82 82 100.00% 100.00% +MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999 +MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.002 +AREA-OPTIMIZED_TARGET_RMSE= 3.582120066e-04 +DISTRIBUTION OF AREA/US WEIGHT RATIO (n=225256): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.000000, < 0.000001: 27 27 0.01% 0.01% +>= 0.000001, < 0.100000: 9749 9776 4.33% 4.34% +>= 0.100000, < 0.200000: 4139 13915 1.84% 6.18% +>= 0.200000, < 0.500000: 16007 29922 7.11% 13.28% +>= 0.500000, < 0.800000: 34755 64677 15.43% 28.71% +>= 0.800000, < 0.850000: 12872 77549 5.71% 34.43% +>= 0.850000, < 0.900000: 18966 96515 8.42% 42.85% +>= 0.900000, < 0.950000: 19631 116146 8.71% 51.56% +>= 0.950000, < 1.000000: 35814 151960 15.90% 67.46% +>= 1.000000, < 1.050000: 31899 183859 14.16% 81.62% +>= 1.050000, < 1.100000: 13056 196915 5.80% 87.42% +>= 1.100000, < 1.150000: 8547 205462 3.79% 91.21% +>= 1.150000, < 1.200000: 6076 211538 2.70% 93.91% +>= 1.200000, < 2.000000: 13112 224650 5.82% 99.73% +>= 2.000000, < 5.000000: 603 225253 0.27% 100.00% +>= 5.000000, < 10.000000: 2 225255 0.00% 100.00% +>= 10.000000, < 100.000000: 1 225256 0.00% 100.00% +RMSE OF AREA/US WEIGHT RATIO DEVIATIONS FROM ONE = 3.501686e-01 diff --git a/tmd/areas/weights/xx.log 
b/tmd/areas/weights/xx.log new file mode 100644 index 00000000..c5e829a4 --- /dev/null +++ b/tmd/areas/weights/xx.log @@ -0,0 +1,96 @@ +CREATING WEIGHTS FILE FOR AREA xx ... +USING CUSTOMIZED PARAMETERS IN xx_params.yaml +INITIAL WEIGHTS STATISTICS: +sum of national weights = 1.840247e+08 +area weights_scale = 9.871864e-02 +USING xx_targets.csv FILE WITH 16 TARGETS +ASSUMING TARGET_RATIO_TOLERANCE = 0.004000 +TARGET001:ACT-EXP,ACT/EXP= 2.220446049e-16, 1.000 +TARGET002:ACT-EXP,ACT/EXP= -4.889331414e-01, 0.511 +TARGET003:ACT-EXP,ACT/EXP= 3.543663558e-01, 1.354 +TARGET004:ACT-EXP,ACT/EXP= -1.091439669e-01, 0.891 +TARGET005:ACT-EXP,ACT/EXP= 2.198130460e+00, 3.198 +TARGET006:ACT-EXP,ACT/EXP= 3.160928471e-01, 1.316 +TARGET007:ACT-EXP,ACT/EXP= 2.174874017e-01, 1.217 +TARGET008:ACT-EXP,ACT/EXP= 7.650046591e+00, 8.650 +TARGET009:ACT-EXP,ACT/EXP= 1.741027627e+00, 2.741 +TARGET010:ACT-EXP,ACT/EXP= 3.776025414e+00, 4.776 +TARGET011:ACT-EXP,ACT/EXP= 3.173963684e+00, 4.174 +TARGET012:ACT-EXP,ACT/EXP= 6.060076711e+00, 7.060 +TARGET013:ACT-EXP,ACT/EXP= 1.062016098e+00, 2.062 +TARGET014:ACT-EXP,ACT/EXP= 2.244594909e+00, 3.245 +TARGET015:ACT-EXP,ACT/EXP= 3.395899895e+00, 4.396 +TARGET016:ACT-EXP,ACT/EXP= 1.232882634e+00, 2.233 +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=16): +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.400000, < 0.800000: 1 1 6.25% 6.25% +>= 0.800000, < 0.900000: 1 2 6.25% 12.50% +>= 0.900000, < 0.990000: 0 2 0.00% 12.50% +>= 0.990000, < 0.996000: 0 2 0.00% 12.50% +>= 0.996000, < 1.004000: 1 3 6.25% 18.75% +>= 1.004000, < 1.010000: 0 3 0.00% 18.75% +>= 1.010000, < 1.100000: 0 3 0.00% 18.75% +>= 1.100000, < 1.200000: 0 3 0.00% 18.75% +>= 1.200000, < 1.600000: 3 6 18.75% 37.50% +>= 1.600000, < 2.000000: 0 6 0.00% 37.50% +>= 2.000000, < 3.000000: 3 9 18.75% 56.25% +>= 3.000000, < 4.000000: 2 11 12.50% 68.75% +>= 4.000000, < 5.000000: 3 14 18.75% 87.50% +>= 5.000000, < inf: 2 16 12.50% 100.00% +MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.511 +MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 8.650 +US_PROPORTIONALLY_SCALED_TARGET_RMSE= 3.032996557e+00 +target_matrix sparsity ratio = 0.597 +OPTIMIZE WEIGHT RATIOS POSSIBLY IN A REGULARIZATION LOOP + where initial REGULARIZATION DELTA value is 1.000000e-09 + and there is only one REGULARIZATION LOOP + and where target_matrix.shape= (225256, 16) + ::loop,delta,misses: 1 1.000000e-09 0 +>>> final delta loop iterations=152 success=True +>>> message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH +>>> L-BFGS-B optimized objective function value: 1.253966268e-04 +AREA-OPTIMIZED_TARGET_MISSES= 0 +TARGET001:ACT-EXP,ACT/EXP= 5.883669201e-06, 1.000 +TARGET002:ACT-EXP,ACT/EXP= -1.257745513e-04, 1.000 +TARGET003:ACT-EXP,ACT/EXP= -3.035473917e-05, 1.000 +TARGET004:ACT-EXP,ACT/EXP= 1.817088750e-04, 1.000 +TARGET005:ACT-EXP,ACT/EXP= -1.960859660e-07, 1.000 +TARGET006:ACT-EXP,ACT/EXP= 5.549283742e-05, 1.000 +TARGET007:ACT-EXP,ACT/EXP= -3.271301651e-04, 1.000 +TARGET008:ACT-EXP,ACT/EXP= -6.091260532e-06, 1.000 +TARGET009:ACT-EXP,ACT/EXP= -1.460987897e-06, 1.000 +TARGET010:ACT-EXP,ACT/EXP= 9.247871540e-05, 1.000 +TARGET011:ACT-EXP,ACT/EXP= -3.324367229e-05, 1.000 +TARGET012:ACT-EXP,ACT/EXP= -2.624945468e-05, 1.000 +TARGET013:ACT-EXP,ACT/EXP= 1.035485400e-06, 1.000 +TARGET014:ACT-EXP,ACT/EXP= 3.037023195e-05, 1.000 +TARGET015:ACT-EXP,ACT/EXP= 5.381491532e-05, 1.000 +TARGET016:ACT-EXP,ACT/EXP= -1.456029767e-05, 1.000 +DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=16): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 
0.996000, < 1.004000: 16 16 100.00% 100.00% +MINIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000 +MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.000 +AREA-OPTIMIZED_TARGET_RMSE= 1.043743910e-04 +DISTRIBUTION OF AREA/US WEIGHT RATIO (n=225256): + with REGULARIZATION_DELTA= 1.000000e-09 +low bin ratio high bin ratio bin # cum # bin % cum % +>= 0.000000, < 0.000001: 439 439 0.19% 0.19% +>= 0.000001, < 0.100000: 51017 51456 22.65% 22.84% +>= 0.100000, < 0.200000: 7951 59407 3.53% 26.37% +>= 0.200000, < 0.500000: 17881 77288 7.94% 34.31% +>= 0.500000, < 0.800000: 28460 105748 12.63% 46.95% +>= 0.800000, < 0.850000: 9328 115076 4.14% 51.09% +>= 0.850000, < 0.900000: 12651 127727 5.62% 56.70% +>= 0.900000, < 0.950000: 15224 142951 6.76% 63.46% +>= 0.950000, < 1.000000: 19312 162263 8.57% 72.03% +>= 1.000000, < 1.050000: 11414 173677 5.07% 77.10% +>= 1.050000, < 1.100000: 5648 179325 2.51% 79.61% +>= 1.100000, < 1.150000: 4572 183897 2.03% 81.64% +>= 1.150000, < 1.200000: 4280 188177 1.90% 83.54% +>= 1.200000, < 2.000000: 28367 216544 12.59% 96.13% +>= 2.000000, < 5.000000: 8242 224786 3.66% 99.79% +>= 5.000000, < 10.000000: 420 225206 0.19% 99.98% +>= 10.000000, < 100.000000: 50 225256 0.02% 100.00% +RMSE OF AREA/US WEIGHT RATIO DEVIATIONS FROM ONE = 7.455945e-01 From 24e05cb22f89dd8e4d153232c34d404f37e55126 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:43:05 -0500 Subject: [PATCH 02/33] update examine references to cached_allvars.csv --- tmd/areas/weights/examine/R/test_read.R | 2 +- ..._get_tmd_and_summarize_and_save_cd_weighted_results.qmd | 7 +++---- .../weights/examine/issue279_comment_onPR267_chisquare.R | 3 ++- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tmd/areas/weights/examine/R/test_read.R b/tmd/areas/weights/examine/R/test_read.R index f407c783..1002b563 100644 --- a/tmd/areas/weights/examine/R/test_read.R +++ b/tmd/areas/weights/examine/R/test_read.R @@ -26,7 +26,7 @@ mhhi2022 <- get_acs( # tmd2021_cache.csv ------------------------------------------------------- TMDDIR <- here::here("..", "..", "..", "storage", "output") -fpath <- fs::path(TMDDIR, "tmd2021_cache.csv") +fpath <- fs::path(TMDDIR, "cached_allvars.csv") # tmd2021 <- read_csv(fpath) tmd2021 <- vroom(fpath) ns(tmd2021) diff --git a/tmd/areas/weights/examine/cd_get_tmd_and_summarize_and_save_cd_weighted_results.qmd b/tmd/areas/weights/examine/cd_get_tmd_and_summarize_and_save_cd_weighted_results.qmd index 79204d41..f0e22ab4 100644 --- a/tmd/areas/weights/examine/cd_get_tmd_and_summarize_and_save_cd_weighted_results.qmd +++ b/tmd/areas/weights/examine/cd_get_tmd_and_summarize_and_save_cd_weighted_results.qmd @@ -12,7 +12,7 @@ Because this is time consuming and uses a lot of memory, and may not work well o Get data and merge: -- tmd2021 data used when creating area weights (tmd2021_cache.csv) +- tmd2021 data used when creating area weights (cached_allvars.csv) - weights - merge tmd2021 with weights @@ -43,9 +43,8 @@ source(here::here("R", "functions.R")) ## Get tmd2021 file -Get `tmd2021_cache.csv`, a saved version of data from an object constructed during creation of area weights, in the file `create_taxcalc_cached_files.py`. `tmd2021_cache.csv` is the then-current tmd file with 2021 values, run through Tax-Calculator with 2021 law, written as csv. It includes all Tax-Calculator input and output variables. +Get `cached_allvars.csv`, a saved version of data from an object constructed during creation of area weights, in the file `create_taxcalc_cached_files.py`. 
`cached_allvars.csv` is the then-current tmd file with 2021 values, run through Tax-Calculator with 2021 law, written as csv. It includes all Tax-Calculator input and output variables. -The master branch of tax-microdata-benchmarking does not write `tmd2021_cache.csv` (it saves a few key variables in a cache), but it is written in Don Boyd's fork, branch xprep2, (see commit fe0c687433217ef455e359d3f2450c9e3e18cc06). ```{r} #| label: get-tmd @@ -53,7 +52,7 @@ The master branch of tax-microdata-benchmarking does not write `tmd2021_cache.cs #| output: false TMDDIR <- here::here("..", "..", "..", "storage", "output") -fpath <- fs::path(TMDDIR, "tmd2021_cache.csv") +fpath <- fs::path(TMDDIR, "cached_allvars.csv") # tmd2021_cache.csv tmd2021 <- vroom(fpath) # ~ 600mb ns(tmd2021) # tmd2021 |> filter(row_number() < 20) |> select(RECID, s006, c00100) diff --git a/tmd/areas/weights/examine/issue279_comment_onPR267_chisquare.R b/tmd/areas/weights/examine/issue279_comment_onPR267_chisquare.R index 9fff8588..9fca2834 100644 --- a/tmd/areas/weights/examine/issue279_comment_onPR267_chisquare.R +++ b/tmd/areas/weights/examine/issue279_comment_onPR267_chisquare.R @@ -52,7 +52,8 @@ targ64 <- bind_rows( # tmd2021 ---- TMDDIR <- here::here("..", "..", "..", "storage", "output") -fpath <- fs::path(TMDDIR, "tmd2021_cache.csv") +# filename has changed since this was done - now cached_allvars.csv +fpath <- fs::path(TMDDIR, "tmd2021_cache.csv") tmd2021 <- vroom(fpath) ns(tmd2021) From 50f8d9a04e19e71bd806f262357918ec1c5afa9f Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:46:32 -0500 Subject: [PATCH 03/33] update prepare references to cached_allvars.csv --- .../prepare/cd_enhance_basefile_with_special_targets.qmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd b/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd index 71924d80..e912d025 100644 --- a/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd +++ b/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd @@ -80,7 +80,7 @@ Get the unenhanced target data and the tmd2021 cached data. stack <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) TMDDIR <- here::here("..", "..", "..", "storage", "output") -fpath <- fs::path(TMDDIR, "tmd2021_cache.csv") +fpath <- fs::path(TMDDIR, "cached_allvars.csv") tmd2021 <- vroom(fpath) ns(tmd2021) @@ -127,7 +127,7 @@ stack |> # IRS values in 2015 per PUF documentation: # Full sample 2015: 35,270,132,000 # PUF 2015: 35,280,959,000 -# tmd2021_cache.csv data_source==1: +# cached_allvars.csv data_source==1: # e18400: 445,295,833,890 # e18400_capped: 445,295,833,890 # Note: IRS national 2021 capped amount (per my notes) was $258.640 billion, not too far from the national CD N18425 amount of $249.3 billion @@ -137,7 +137,7 @@ stack |> # IRS values in 2015 per PUF documentation # Full sample 2015: 18,860,584,000 # PUF 2015: 18,885,310,000 -# tmd2021_cache.csv data_source==1: +# cached_allvars.csv data_source==1: # e18500: 226,186,527,940 # e18500_capped: 226,186,527,940 @@ -299,7 +299,7 @@ setdiff(names(salt_final), names(stack)) We only have 2021 taxable Social Security values in the Congressional district data (A02500 and N02500). We'd really like to have total Social Security. -The tmd2021_cache.csv has total Social Security (e02400) and calculated taxable Social Security (c02500). 
+File cached_allvars.csv has total Social Security (e02400) and calculated taxable Social Security (c02500). (See commented-out lines in the code chunk below for details on what's in each file.) From c21a1dd05818cc7c04490d25be748316d705e639 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:53:57 -0500 Subject: [PATCH 04/33] minor edit --- tmd/areas/targets/prepare/cd_create_variable_mapping.qmd | 2 -- 1 file changed, 2 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd b/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd index bbb036ee..3487af51 100644 --- a/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd +++ b/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd @@ -15,8 +15,6 @@ We do this in its own qmd file to make it easy to find, because it is also used source(here::here("R", "libraries.R")) source(here::here("R", "constants.R")) -# 334283385.27000004 national pop - ``` ## Create and save the mapping From 10c9c034ec338557ffb7d45f314f4a0d595bf800 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:54:26 -0500 Subject: [PATCH 05/33] create qmd file for comparison --- ...compare_us_totals_tmd_vs_irs_published.qmd | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd new file mode 100644 index 00000000..3ff262c7 --- /dev/null +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -0,0 +1,24 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Compare U.S. totals of mapped variables, tax-microdata-benchmarking vs. 
IRS published CD values + + +## Setup +```{r} +#| label: setup + +source(here::here("R", "libraries.R")) +source(here::here("R", "constants.R")) + +``` + +## Get data + +```{r} + +``` + From 9e65735d7ed2d2ca6df67176f6e047047a4d38de Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:58:38 -0500 Subject: [PATCH 06/33] load utility functions --- .../prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 3ff262c7..08478135 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -13,6 +13,7 @@ editor_options: source(here::here("R", "libraries.R")) source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) ``` From ebf6bb6f5eab6a05e9a247ea2dc01e8efba75b6c Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 05:59:04 -0500 Subject: [PATCH 07/33] get data --- .../prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 08478135..33277d50 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -21,5 +21,13 @@ source(here::here("R", "functions.R")) ```{r} +vmap <- read_csv(fs::path(CDINTERMEDIATE, "cd_variable_mapping.csv")) +cdirs <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) + +TMDDIR <- here::here("..", "..", "..", "storage", "output") +fpath <- fs::path(TMDDIR, "cached_allvars.csv") +tmd2021 <- vroom(fpath) +ns(tmd2021) + ``` From f7344af9ffba7aa0c853c9da8410c02d89fff3eb Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 06:13:35 -0500 Subject: [PATCH 08/33] summarize CD data for vmapped variables --- ...compare_us_totals_tmd_vs_irs_published.qmd | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 33277d50..5178404b 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -20,6 +20,8 @@ source(here::here("R", "functions.R")) ## Get data ```{r} +#| label: get-data +#| output: false vmap <- read_csv(fs::path(CDINTERMEDIATE, "cd_variable_mapping.csv")) cdirs <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) @@ -31,3 +33,31 @@ ns(tmd2021) ``` +## Create comparison file + +```{r} +#| label: create-comp +#| output: false + +vmap + +#.. cd data ---- +cd2 <- cdirs |> + filter(basevname %in% vmap$basevname, + session==118, + scope==1) + +count(cd2, basevname, vname) +count(cd2, rectype) +skim(cd2) + +cd3 <- cd2 |> + summarise(target=sum(target), + .by=c(basevname, count, agistub, agirange, description)) + +#.. 
tmd data ---- + + + +``` + From a0d4aab2ac6a4febf025e8e042f9dfab372a8699 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 07:25:30 -0500 Subject: [PATCH 09/33] prepare cd data --- ...compare_us_totals_tmd_vs_irs_published.qmd | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 5178404b..cdd3e20c 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -35,29 +35,34 @@ ns(tmd2021) ## Create comparison file +Prepare Congressional district data. + ```{r} -#| label: create-comp +#| label: prepare-cd-data #| output: false -vmap +# vmap -#.. cd data ---- cd2 <- cdirs |> filter(basevname %in% vmap$basevname, session==118, - scope==1) + scope==1 | basevname=="XTOT") count(cd2, basevname, vname) count(cd2, rectype) +count(cd2, scope) +count(cd2, fstatus) +count(cd2 |> filter(str_detect(vname, "MARS")), + vname, fstatus) skim(cd2) cd3 <- cd2 |> summarise(target=sum(target), .by=c(basevname, count, agistub, agirange, description)) -#.. tmd data ---- - +``` +Prepare tmd data -``` +```{r} From ab020d0a7b4e7388e9b52f8c2602b7fa4e348b67 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 07:28:04 -0500 Subject: [PATCH 10/33] agi range documentation --- ...compare_us_totals_tmd_vs_irs_published.qmd | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index cdd3e20c..57c2c8a1 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -65,4 +65,31 @@ cd3 <- cd2 |> Prepare tmd data ```{r} +#| label: prepare-tmd-data +#| eval: false +#| output: false + +# icuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) +# icuts <- CDICUTS + +# 0 = Total +# 1 = Under $1 +# 2 = $1 under $10,000 +# 3 = $10,000 under $25,000 +# 4 = $25,000 under $50,000 +# 5 = $50,000 under $75,000 +# 6 = $75,000 under $100,000 +# 7 = $100,000 under $200,000 +# 8 = $200,000 under $500,000 +# 9 = $500,000 or more + +vmap + +vmap2 <- vmap |> + mutate(varname=ifelse(str_starts(basevname, "MARS"), + basevname, + varname)) + +agicuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) + From 3a677177f8d82a74ecd573d39832c549093e6aec Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 07:29:23 -0500 Subject: [PATCH 11/33] summarize tmd file by categoricals --- .../cd_compare_us_totals_tmd_vs_irs_published.qmd | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 57c2c8a1..b201e666 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -92,4 +92,16 @@ vmap2 <- vmap |> agicuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) +tmd2 <- tmd2021 |> + mutate(scope=ifelse(data_source==0, 2, 1), + MARS1=MARS==1, MARS2=MARS==2, MARS4=MARS==4, + agistub=cut(c00100, agicuts, right = FALSE, ordered_result = TRUE) |> + as.integer()) |> + 
summarize(across(all_of(vmap2$varname),
+            list(amount = \(x) sum(x * s006),
+                 nzcount = \(x) sum((x != 0) * s006), # parens needed: * binds tighter than != in R
+                 allcount= \(x) sum(s006))),
+            .by=c(scope, agistub)) |>
+  arrange(scope, agistub)
+

From ba6d6b9e4727d06ac591bf6a2fdd6b1f61e23778 Mon Sep 17 00:00:00 2001
From: donboyd5
Date: Mon, 18 Nov 2024 07:30:33 -0500
Subject: [PATCH 12/33] make long summarized file and create categorical for amounts and counts

---
 .../cd_compare_us_totals_tmd_vs_irs_published.qmd | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
index b201e666..05b4d30f 100644
--- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
+++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
@@ -104,4 +104,14 @@ tmd2 <- tmd2021 |>
   .by=c(scope, agistub)) |>
   arrange(scope, agistub)
+# flip around and get count
+tmd3 <- tmd2 |>
+  pivot_longer(cols=-c(scope, agistub)) |>
+  separate_wider_delim(cols=name, delim="_", names=c("varname", "type")) |>
+  mutate(count = case_when(
+    type == "amount" ~ 0,
+    type == "nzcount" ~ 2,
+    type == "allcount" ~ 1,
+    .default = -9e9))
+count(tmd3, count, type)

From d694809db32fee56edf0733bfb7cae3e90c623f3 Mon Sep 17 00:00:00 2001
From: donboyd5
Date: Mon, 18 Nov 2024 07:30:57 -0500
Subject: [PATCH 13/33] create non-MARS dataframe

---
 .../cd_compare_us_totals_tmd_vs_irs_published.qmd | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
index 05b4d30f..95150bdf 100644
--- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
+++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
@@ -115,3 +115,17 @@ tmd3 <- tmd2 |>
     .default = -9e9))
 count(tmd3, count, type)
+# separate the mars and nonmars variables to get proper mars values
+tmdxmars <- tmd3 |>
+  filter(str_detect(varname, "MARS", negate = TRUE)) |>
+  mutate(fstatus=0) |>
+  select(varname, scope, fstatus, count, agistub, value) |>
+  arrange(varname, scope, fstatus, count, agistub)
+
+# tmdmars
+
+
+```
+
+
+

From 140cfa74502257371a750c36bce0b47f4d0d0c8b Mon Sep 17 00:00:00 2001
From: donboyd5
Date: Mon, 18 Nov 2024 07:48:25 -0500
Subject: [PATCH 14/33] format mars records consistent with how target files treat them

---
 ...compare_us_totals_tmd_vs_irs_published.qmd | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
index 95150bdf..654e1606 100644
--- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
+++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
@@ -122,8 +122,25 @@ tmd3 <- tmd2 |>
   select(varname, scope, fstatus, count, agistub, value) |>
   arrange(varname, scope, fstatus, count, agistub)
-# tmdmars
-
+# check tmd mars totals
+ tmd3 |>
+  filter(str_detect(varname, "MARS")) |>
+  summarize(value=sum(value),
+            .by=c(varname, scope, type, count))
+  # this helps verify that we want type=="amount" as the number of returns
+  # nothing else is useful
+  # CAUTION: to be consistent with the xxxx_targets.csv file format rules
+  # we need to set count to 1 (number of units with any value) INSTEAD of zero
+  # and by convention we'll
use c00100 as the variable name but any variable would be the same + +tmdmars <- tmd3 |> + filter(str_detect(varname, "MARS"), + type=="amount") |> + mutate(fstatus=str_sub(varname, -1) |> + as.integer(), + varname="c00100") |> + select(varname, scope, fstatus, count, agistub, value) |> + arrange(varname, scope, fstatus, count, agistub) ``` From 9255869e3a405ab9df63a1a5fc9bced4a90c5e37 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 07:58:42 -0500 Subject: [PATCH 15/33] fix treatment of XTOT in the non-MARS dataframe --- .../prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 654e1606..5315c4a3 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -118,6 +118,7 @@ count(tmd3, count, type) # separate the mars and nonmars variables to get proper mars values tmdxmars <- tmd3 |> filter(str_detect(varname, "MARS", negate = TRUE)) |> + filter(!(varname=="XTOT" & type != "amount")) |> mutate(fstatus=0) |> select(varname, scope, fstatus, count, agistub, value) |> arrange(varname, scope, fstatus, count, agistub) From 5ab857c3d1470790de6165934017c01f5d905137 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 07:59:26 -0500 Subject: [PATCH 16/33] combine adjusted MARS and non-MARS dataframes --- .../prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 5315c4a3..f332817c 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -143,6 +143,8 @@ tmdmars <- tmd3 |> select(varname, scope, fstatus, count, agistub, value) |> arrange(varname, scope, fstatus, count, agistub) +# combine files and create selected additional totals +tmdsums1 <- bind_rows(tmdxmars, tmdmars) ``` From e1f3bb42ef2a2eb82db7ebd8808159f0e75d664f Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 08:00:19 -0500 Subject: [PATCH 17/33] combine with totals across all agistubs (new agistub=0) --- .../cd_compare_us_totals_tmd_vs_irs_published.qmd | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index f332817c..95a2e15a 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -145,6 +145,16 @@ tmdmars <- tmd3 |> # combine files and create selected additional totals tmdsums1 <- bind_rows(tmdxmars, tmdmars) + +# concatenate totals across agi ranges +tmd_agitots <- tmdsums1 |> + summarise(value=sum(value), + .by=c(varname, scope, fstatus, count)) |> + mutate(agistub=0) |> + bind_rows(tmdsums1) + + + ``` From e3aae2804accca1ab545e871ba9c24df67f91e17 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 08:01:02 -0500 Subject: [PATCH 18/33] combine with totals across all scopes (new scope=0) --- .../cd_compare_us_totals_tmd_vs_irs_published.qmd | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
index 95a2e15a..2e9c509e 100644
--- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
+++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
@@ -153,6 +153,16 @@ tmd_agitots <- tmdsums1 |>
   mutate(agistub=0) |>
   bind_rows(tmdsums1)
+# concatenate totals across scopes
+tmd_scopes <- tmd_agitots |>
+  summarise(value=sum(value),
+            .by=c(varname, agistub, fstatus, count)) |>
+  mutate(scope=0) |>
+  bind_rows(tmd_agitots) |>
+  select(varname, scope, fstatus, count, agistub, value) |>
+  arrange(varname, scope, fstatus, count, agistub)
+
+
 ```

From 59ecf987f2dee3c14ba4594588ec7944605873e3 Mon Sep 17 00:00:00 2001
From: donboyd5
Date: Mon, 18 Nov 2024 10:42:35 -0500
Subject: [PATCH 19/33] create cd_adjusted

---
 .../prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
index 2e9c509e..0fdd483f 100644
--- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
+++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
@@ -56,9 +56,9 @@ count(cd2 |> filter(str_detect(vname, "MARS")),
       vname, fstatus)
 skim(cd2)
 
-cd3 <- cd2 |>
+cd_adjusted <- cd2 |>
   summarise(target=sum(target),
-            .by=c(basevname, count, agistub, agirange, description))
+            .by=c(basevname, vname, scope, count, fstatus, agistub, agirange, description))

From eaaabdb2296fe663ef93c612527ff65cf2b9e576 Mon Sep 17 00:00:00 2001
From: donboyd5
Date: Mon, 18 Nov 2024 10:43:35 -0500
Subject: [PATCH 20/33] improve use of vmap

---
 ...compare_us_totals_tmd_vs_irs_published.qmd | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
index 0fdd483f..13874a33 100644
--- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
+++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd
@@ -83,12 +83,9 @@ Prepare tmd data
 # 8 = $200,000 under $500,000
 # 9 = $500,000 or more
 
-vmap
-
-vmap2 <- vmap |>
-  mutate(varname=ifelse(str_starts(basevname, "MARS"),
-                        basevname,
-                        varname))
+variables_to_sum <- ifelse(str_starts(vmap$basevname, "MARS"),
+                           vmap$basevname,
+                           vmap$varname)
 
 agicuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf)
 
@@ -97,7 +94,7 @@ tmd2 <- tmd2021 |>
          MARS1=MARS==1, MARS2=MARS==2, MARS4=MARS==4,
          agistub=cut(c00100, agicuts, right = FALSE, ordered_result = TRUE) |>
            as.integer()) |>
-  summarize(across(all_of(vmap2$varname),
+  summarize(across(all_of(variables_to_sum),
             list(amount = \(x) sum(x * s006),
                 nzcount = \(x) sum((x != 0) * s006), # parens needed: * binds tighter than != in R
                 allcount= \(x) sum(s006))),
@@ -162,10 +159,28 @@ tmd_scopes <- tmd_agitots |>
   select(varname, scope, fstatus, count, agistub, value) |>
   arrange(varname, scope, fstatus, count, agistub)
 
+vmap2 <- vmap |>
+  mutate(fstatus=case_when(basevname=="MARS1" ~ 1,
+                           basevname=="MARS2" ~ 2,
+                           basevname=="MARS4" ~ 4,
+                           .default = 0) |>
+           as.integer())
+tmd_adjusted <- tmd_scopes |>
+  left_join(vmap2,
+            by = join_by(varname, fstatus))
 ```
+## Prepare comparison file
+
+```{r}
+#| label: prepare-compare
+#| eval: false
+#| output: false
+
+
+```

From
55fe4790543af70e1ca26e9135c044f86fcd6bc3 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 10:48:27 -0500 Subject: [PATCH 21/33] minor edits --- ..._compare_us_totals_tmd_vs_irs_published.qmd | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 13874a33..1f27a7b8 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -41,8 +41,6 @@ Prepare Congressional district data. #| label: prepare-cd-data #| output: false -# vmap - cd2 <- cdirs |> filter(basevname %in% vmap$basevname, session==118, @@ -69,9 +67,6 @@ Prepare tmd data #| eval: false #| output: false -# icuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) -# icuts <- CDICUTS - # 0 = Total # 1 = Under $1 # 2 = $1 under $10,000 @@ -83,12 +78,12 @@ Prepare tmd data # 8 = $200,000 under $500,000 # 9 = $500,000 or more +agicuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) + variables_to_sum <- ifelse(str_starts(vmap$basevname, "MARS"), vmap$basevname, vmap$varname) -agicuts <- c(-Inf, 1, 10e3, 25e3, 50e3, 75e3, 100e3, 200e3, 500e3, Inf) - tmd2 <- tmd2021 |> mutate(scope=ifelse(data_source==0, 2, 1), MARS1=MARS==1, MARS2=MARS==2, MARS4=MARS==4, @@ -140,10 +135,8 @@ tmdmars <- tmd3 |> select(varname, scope, fstatus, count, agistub, value) |> arrange(varname, scope, fstatus, count, agistub) -# combine files and create selected additional totals +# combine files and concatenate totals across agi ranges tmdsums1 <- bind_rows(tmdxmars, tmdmars) - -# concatenate totals across agi ranges tmd_agitots <- tmdsums1 |> summarise(value=sum(value), .by=c(varname, scope, fstatus, count)) |> @@ -159,6 +152,7 @@ tmd_scopes <- tmd_agitots |> select(varname, scope, fstatus, count, agistub, value) |> arrange(varname, scope, fstatus, count, agistub) +# we need to put fstatus on vmap for proper merging vmap2 <- vmap |> mutate(fstatus=case_when(basevname=="MARS1" ~ 1, basevname=="MARS2" ~ 2, @@ -170,7 +164,6 @@ tmd_adjusted <- tmd_scopes |> left_join(vmap2, by = join_by(varname, fstatus)) - ``` @@ -181,6 +174,9 @@ tmd_adjusted <- tmd_scopes |> #| eval: false #| output: false +comp <- tmd_adjusted |> + left_join(cd_adjusted, + by = join_by(scope, fstatus, count, agistub, basevname, description)) ``` From 0ad4dcb63c209a78f899a4487242310cd8c1bdf8 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 10:56:09 -0500 Subject: [PATCH 22/33] create comparison file --- .../cd_compare_us_totals_tmd_vs_irs_published.qmd | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 1f27a7b8..c6d59d90 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -175,8 +175,15 @@ tmd_adjusted <- tmd_scopes |> #| output: false comp <- tmd_adjusted |> - left_join(cd_adjusted, - by = join_by(scope, fstatus, count, agistub, basevname, description)) + select(-description) |> + inner_join(cd_adjusted, + by = join_by(scope, fstatus, count, agistub, basevname)) |> + relocate(value, .after = target) |> + mutate(diff=value - target, + pdiff=diff / target) +summary(comp) 
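+# illustrative sketch: tabulate comparisons that miss by more than a 5%
+# tolerance; the 0.05 threshold is an assumed value, not a project convention
+comp |>
+  filter(abs(pdiff) > 0.05) |>
+  count(count, scope)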
+skim(comp) + ``` From 78064783014274a58c7dcb7017cbb7f04b2dbce9 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 11:27:26 -0500 Subject: [PATCH 23/33] set eval: true in 2 chunks --- .../prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index c6d59d90..60c92ba7 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -64,7 +64,7 @@ Prepare tmd data ```{r} #| label: prepare-tmd-data -#| eval: false +#| eval: true #| output: false # 0 = Total @@ -171,7 +171,7 @@ tmd_adjusted <- tmd_scopes |> ```{r} #| label: prepare-compare -#| eval: false +#| eval: true #| output: false comp <- tmd_adjusted |> From 943d9594d8d9236d9f5e3999d26ed9e19dd61ac0 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 11:52:41 -0500 Subject: [PATCH 24/33] prelim creation of comparison file and start of debugging --- ...compare_us_totals_tmd_vs_irs_published.qmd | 73 ++++++++++++++++--- 1 file changed, 64 insertions(+), 9 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index 60c92ba7..f2687ef6 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -41,11 +41,15 @@ Prepare Congressional district data. #| label: prepare-cd-data #| output: false +count(cdirs, count) + cd2 <- cdirs |> filter(basevname %in% vmap$basevname, session==118, scope==1 | basevname=="XTOT") +glimpse(cd2) +count(cd2, count) count(cd2, basevname, vname) count(cd2, rectype) count(cd2, scope) @@ -96,9 +100,15 @@ tmd2 <- tmd2021 |> .by=c(scope, agistub)) |> arrange(scope, agistub) +# look at nzcounts +tmd2 |> select(scope, agistub, contains("nzcount")) +tmd2 |> select(scope, agistub, contains("allcount")) +tmd2 |> select(scope, agistub, contains("amount")) + + # flip around and get count tmd3 <- tmd2 |> - pivot_longer(cols=-c(scope, agistub)) |> + pivot_longer(cols=-c(scope, agistub), values_to = "wtdvalue") |> separate_wider_delim(cols=name, delim="_", names=c("varname", "type")) |> mutate(count = case_when( type == "amount" ~ 0, @@ -107,18 +117,25 @@ tmd3 <- tmd2 |> .default = -9e9)) count(tmd3, count, type) +tmd3 |> filter(count==2, agistub==4) # looks good +tmd3 |> filter(count==1, agistub==4) +# 37,694,755 is the bad val allcount + + # separate the mars and nonmars variables to get proper mars values tmdxmars <- tmd3 |> filter(str_detect(varname, "MARS", negate = TRUE)) |> filter(!(varname=="XTOT" & type != "amount")) |> mutate(fstatus=0) |> - select(varname, scope, fstatus, count, agistub, value) |> + select(varname, scope, fstatus, count, agistub, wtdvalue) |> arrange(varname, scope, fstatus, count, agistub) +# tmdxmars |> filter(varname=="e00200", agistub==4) # we'll want count==2 for counts -- nonzero counts + # check tmd mars totals tmd3 |> filter(str_detect(varname, "MARS")) |> - summarize(value=sum(value), + summarize(wtdvalue=sum(wtdvalue), .by=c(varname, scope, type, count)) # this helps verify that we want type=="amount" as the number of returns # nothing else is useful @@ -132,24 +149,24 @@ tmdmars <- tmd3 |> mutate(fstatus=str_sub(varname, -1) |> as.integer(), 
varname="c00100") |> - select(varname, scope, fstatus, count, agistub, value) |> + select(varname, scope, fstatus, count, agistub, wtdvalue) |> arrange(varname, scope, fstatus, count, agistub) # combine files and concatenate totals across agi ranges tmdsums1 <- bind_rows(tmdxmars, tmdmars) tmd_agitots <- tmdsums1 |> - summarise(value=sum(value), + summarise(wtdvalue=sum(wtdvalue), .by=c(varname, scope, fstatus, count)) |> mutate(agistub=0) |> bind_rows(tmdsums1) # concatenate totals across scopes tmd_scopes <- tmd_agitots |> - summarise(value=sum(value), + summarise(wtdvalue=sum(wtdvalue), .by=c(varname, agistub, fstatus, count)) |> mutate(scope=0) |> bind_rows(tmd_agitots) |> - select(varname, scope, fstatus, count, agistub, value) |> + select(varname, scope, fstatus, count, agistub, wtdvalue) |> arrange(varname, scope, fstatus, count, agistub) # we need to put fstatus on vmap for proper merging @@ -178,12 +195,50 @@ comp <- tmd_adjusted |> select(-description) |> inner_join(cd_adjusted, by = join_by(scope, fstatus, count, agistub, basevname)) |> - relocate(value, .after = target) |> - mutate(diff=value - target, + relocate(wtdvalue, .after = target) |> + mutate(diff=wtdvalue - target, pdiff=diff / target) summary(comp) skim(comp) +comp |> + arrange(desc(abs(pdiff))) + +badmatches <- c("e18400", "e18500", "e02400") # variables where tmd and IRS concepts are not well aligned +comp |> + filter(!varname %in% badmatches, count==1) |> + arrange(desc(abs(pdiff))) + +comp |> + filter(!varname %in% badmatches, count==1) |> + arrange(desc(wtdvalue)) + +comp |> + filter(!varname %in% badmatches, count==1, agistub==4) |> + arrange(desc(wtdvalue)) + +check <- comp |> + filter(!varname %in% badmatches, count==0) |> + arrange(desc(abs(pdiff))) + +check |> + filter(agistub==0) + +check |> + filter(agistub==9) + +verybad <- check |> + filter(abs(pdiff) >= .3) + +verybad +verybad |> + filter(agistub==0) + +# Lessons: +# - e26270 Partnership / S Corp looks like it could be a conceptual mismatch?? Or some other problem in concept? +# agistub 0 is within 1.5% but ranges are way off +# should we create a shared-down variable? 
+# - e00300 taxable interest seems a little off ``` From fb53b43193bd970d82612106c994645f07f34a6d Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 13:18:56 -0500 Subject: [PATCH 25/33] add code to several files to load utility functions --- tmd/areas/targets/prepare/cd_construct_long_soi_data_file.qmd | 1 + .../cd_create_basefile_for_117Congress_cd_target_files.qmd | 1 + .../cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd | 1 + tmd/areas/targets/prepare/cd_create_variable_mapping.qmd | 1 + 4 files changed, 4 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_construct_long_soi_data_file.qmd b/tmd/areas/targets/prepare/cd_construct_long_soi_data_file.qmd index e4df1f18..d81aecaf 100644 --- a/tmd/areas/targets/prepare/cd_construct_long_soi_data_file.qmd +++ b/tmd/areas/targets/prepare/cd_construct_long_soi_data_file.qmd @@ -17,6 +17,7 @@ This involves cleaning the SOI Congressional District data, adding agi bin infor source(here::here("R", "libraries.R")) source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) # 334283385.27000004 national pop diff --git a/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd b/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd index add1b52a..af7560b9 100644 --- a/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd +++ b/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd @@ -15,6 +15,7 @@ This section creates one long file that is a superset of what we need for indivi source(here::here("R", "libraries.R")) source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) # 334283385.27000004 national pop diff --git a/tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd b/tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd index 5d18affb..4bdeaafb 100644 --- a/tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd +++ b/tmd/areas/targets/prepare/cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd @@ -22,6 +22,7 @@ Doing this requires a crosswalk that shows what fraction of the 2020 population source(here::here("R", "libraries.R")) source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) # phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") diff --git a/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd b/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd index 3487af51..2f0b27a5 100644 --- a/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd +++ b/tmd/areas/targets/prepare/cd_create_variable_mapping.qmd @@ -14,6 +14,7 @@ We do this in its own qmd file to make it easy to find, because it is also used source(here::here("R", "libraries.R")) source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) ``` From c3c534d45b440d1c8ae2740f348803d9d542c722 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 13:23:55 -0500 Subject: [PATCH 26/33] fix count categorical variable so that count==2 for counts of nonzero values --- .../cd_create_basefile_for_117Congress_cd_target_files.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd b/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd index af7560b9..9c22ff6d 100644 --- 
a/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd +++ b/tmd/areas/targets/prepare/cd_create_basefile_for_117Congress_cd_target_files.qmd @@ -124,7 +124,7 @@ cdlong4 <- cdlong3 |> .default = 0), count = case_when( - vtype == "count" ~ 1, + vtype == "count" ~ 2, # correction - used to be 1 vtype == "amount" ~ 0, .default = -99), From dd67a0532488256edfc224fd15097a62813a8bd2 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 16:55:31 -0500 Subject: [PATCH 27/33] remove old log --- tmd/areas/weights/ny21.log | 61 -------------------------------------- 1 file changed, 61 deletions(-) delete mode 100644 tmd/areas/weights/ny21.log diff --git a/tmd/areas/weights/ny21.log b/tmd/areas/weights/ny21.log deleted file mode 100644 index c113cbe2..00000000 --- a/tmd/areas/weights/ny21.log +++ /dev/null @@ -1,61 +0,0 @@ -CREATING WEIGHTS FILE FOR AREA ny21 ... -INITIAL WEIGHTS STATISTICS: -sum of national weights = 1.840247e+08 -area weights_scale = 2.329781e-03 -USING ny21_targets.csv FILE WITH 82 TARGETS -ASSUMING TARGET_RATIO_TOLERANCE = 0.004000 -DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=82): -low bin ratio high bin ratio bin # cum # bin % cum % ->= 0.400000, < 0.800000: 4 4 4.88% 4.88% ->= 0.800000, < 0.900000: 3 7 3.66% 8.54% ->= 0.900000, < 0.990000: 12 19 14.63% 23.17% ->= 0.990000, < 0.996000: 0 19 0.00% 23.17% ->= 0.996000, < 1.004000: 2 21 2.44% 25.61% ->= 1.004000, < 1.010000: 1 22 1.22% 26.83% ->= 1.010000, < 1.100000: 9 31 10.98% 37.80% ->= 1.100000, < 1.200000: 9 40 10.98% 48.78% ->= 1.200000, < 1.600000: 14 54 17.07% 65.85% ->= 1.600000, < 2.000000: 12 66 14.63% 80.49% ->= 2.000000, < 3.000000: 14 80 17.07% 97.56% ->= 3.000000, < 4.000000: 2 82 2.44% 100.00% -MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.669 -MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 3.083 -US_PROPORTIONALLY_SCALED_TARGET_RMSE= 7.375708784e-01 -target_matrix sparsity ratio = 0.929 -OPTIMIZE WEIGHT RATIOS POSSIBLY IN A REGULARIZATION LOOP - where initial REGULARIZATION DELTA value is 1.000000e-09 - and there is only one REGULARIZATION LOOP - and where target_matrix.shape= (225256, 82) - ::loop,delta,misses: 1 1.000000e-09 0 ->>> final delta loop iterations=407 success=True ->>> message: CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH ->>> L-BFGS-B optimized objective function value: 3.814234922e-05 -AREA-OPTIMIZED_TARGET_MISSES= 0 -DISTRIBUTION OF TARGET ACT/EXP RATIOS (n=82): - with REGULARIZATION_DELTA= 1.000000e-09 -low bin ratio high bin ratio bin # cum # bin % cum % ->= 0.996000, < 1.004000: 82 82 100.00% 100.00% -MINIMUM VALUE OF TARGET ACT/EXP RATIO = 0.999 -MAXIMUM VALUE OF TARGET ACT/EXP RATIO = 1.002 -AREA-OPTIMIZED_TARGET_RMSE= 3.582120066e-04 -DISTRIBUTION OF AREA/US WEIGHT RATIO (n=225256): - with REGULARIZATION_DELTA= 1.000000e-09 -low bin ratio high bin ratio bin # cum # bin % cum % ->= 0.000000, < 0.000001: 27 27 0.01% 0.01% ->= 0.000001, < 0.100000: 9749 9776 4.33% 4.34% ->= 0.100000, < 0.200000: 4139 13915 1.84% 6.18% ->= 0.200000, < 0.500000: 16007 29922 7.11% 13.28% ->= 0.500000, < 0.800000: 34755 64677 15.43% 28.71% ->= 0.800000, < 0.850000: 12872 77549 5.71% 34.43% ->= 0.850000, < 0.900000: 18966 96515 8.42% 42.85% ->= 0.900000, < 0.950000: 19631 116146 8.71% 51.56% ->= 0.950000, < 1.000000: 35814 151960 15.90% 67.46% ->= 1.000000, < 1.050000: 31899 183859 14.16% 81.62% ->= 1.050000, < 1.100000: 13056 196915 5.80% 87.42% ->= 1.100000, < 1.150000: 8547 205462 3.79% 91.21% ->= 1.150000, < 1.200000: 6076 211538 2.70% 93.91% ->= 1.200000, < 2.000000: 13112 224650 5.82% 
99.73% ->= 2.000000, < 5.000000: 603 225253 0.27% 100.00% ->= 5.000000, < 10.000000: 2 225255 0.00% 100.00% ->= 10.000000, < 100.000000: 1 225256 0.00% 100.00% -RMSE OF AREA/US WEIGHT RATIO DEVIATIONS FROM ONE = 3.501686e-01 From 0dceca20da74b342b6a42a037f4ca8a124521027 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 16:55:57 -0500 Subject: [PATCH 28/33] mention where it's ok to use eval: false --- .../prepare/cd_enhance_basefile_with_special_targets.qmd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd b/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd index e912d025..eed32750 100644 --- a/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd +++ b/tmd/areas/targets/prepare/cd_enhance_basefile_with_special_targets.qmd @@ -91,6 +91,8 @@ ns(tmd2021) ### Explore SALT-related data +Exploration is interactive so we set eval: false. + ```{r} #| label: salt-explore #| output: false @@ -307,6 +309,7 @@ Pragmatic solution: We will *ASSUME* that total Social Security (e02400) is dist It may be possible at future date to test this assumption, if state-level data have both total and taxable Social Security income. Then we would be able to see how similar the distribution of taxable Social Security income across states is to the distribution of total Social Security income. +Exploration is interactive so we set eval: false. ```{r} #| label: socsec-explore From 3fe791fa6f3f11894c5a057d95f6336d780ad009 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 16:57:16 -0500 Subject: [PATCH 29/33] correct previous failure to save stacked multi-session CD file as csv --- .../cd_create_basefile_multiple_sessions.qmd | 56 +++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 tmd/areas/targets/prepare/cd_create_basefile_multiple_sessions.qmd diff --git a/tmd/areas/targets/prepare/cd_create_basefile_multiple_sessions.qmd b/tmd/areas/targets/prepare/cd_create_basefile_multiple_sessions.qmd new file mode 100644 index 00000000..9b0f325d --- /dev/null +++ b/tmd/areas/targets/prepare/cd_create_basefile_multiple_sessions.qmd @@ -0,0 +1,56 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +# Create a stacked Congressional district file with boundaries for 117th and 118th Congressional sessions + +## Setup + +```{r} +#| label: setup +#| output: false + +source(here::here("R", "libraries.R")) +source(here::here("R", "constants.R")) +source(here::here("R", "functions.R")) + +# phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") +# cd_create_basefile_multiple_sessions +``` + + +## Get data + +```{r} + + +cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_117.csv")) +cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv")) + +stack <- bind_rows( + cd117 |> mutate(session=117), + cd118 |> mutate(session=118) +) + +glimpse(stack) +states <- stack |> + summarise(target=sum(target), + .by=c(session, stabbr, src, rectype, + agistub, agilo, agihi, basevname, + scope, fstatus, count, vname, description, agirange)) + +states |> + pivot_wider(names_from = session, + values_from = target, + names_prefix = "s" + ) |> + mutate(diff=s118 - s117, + pdiff=diff / s117) |> + arrange(desc(abs(pdiff))) # good all the state sums work + +write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) + +``` + From b4e1653449506353fad31a6048dd2228afd8440a Mon Sep 17 00:00:00 2001 From: donboyd5 
Date: Mon, 18 Nov 2024 16:57:58 -0500 Subject: [PATCH 30/33] fix order of execution in _quarto.yml --- tmd/areas/targets/prepare/_quarto.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tmd/areas/targets/prepare/_quarto.yml b/tmd/areas/targets/prepare/_quarto.yml index e85be78c..c2009149 100644 --- a/tmd/areas/targets/prepare/_quarto.yml +++ b/tmd/areas/targets/prepare/_quarto.yml @@ -46,9 +46,10 @@ book: - cd_construct_long_soi_data_file.qmd - cd_create_basefile_for_117Congress_cd_target_files.qmd - cd_create_cd_117_118_crosswalk_and_cdbasefile_118.qmd - - cd_enhance_basefile_with_special_targets.qmd + - cd_create_basefile_multiple_sessions.qmd - cd_create_variable_mapping.qmd - # - cd_map_tcvars_and_extract_target_files.qmd + - cd_compare_us_totals_tmd_vs_irs_published.qmd + - cd_enhance_basefile_with_special_targets.qmd appendices: - cd_issues_and_TODOs.qmd - cd_IRS_documentation.qmd From 1cee71fa10f43682c3a370938dff852a1e2b318d Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 17:00:57 -0500 Subject: [PATCH 31/33] delete old unused files --- ...cd_create_basefile_for_cd_target_files.qmd | 270 ------------------ ...eate_crosswalk_from_cd117th_to_cd118th.qmd | 226 --------------- .../prepare/cd_get_census_population.qmd | 94 ------ ...cd_map_tcvars_and_extract_target_files.qmd | 225 --------------- .../prepare/cd_overall_documentation.qmd | 139 --------- .../targets/prepare/cd_write_target_files.qmd | 268 ----------------- 6 files changed, 1222 deletions(-) delete mode 100644 tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd delete mode 100644 tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd delete mode 100644 tmd/areas/targets/prepare/cd_get_census_population.qmd delete mode 100644 tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd delete mode 100644 tmd/areas/targets/prepare/cd_overall_documentation.qmd delete mode 100644 tmd/areas/targets/prepare/cd_write_target_files.qmd diff --git a/tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd b/tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd deleted file mode 100644 index ced4c05b..00000000 --- a/tmd/areas/targets/prepare/cd_create_basefile_for_cd_target_files.qmd +++ /dev/null @@ -1,270 +0,0 @@ ---- -output: html_document -editor_options: - chunk_output_type: console ---- - -# Create base file for final CD target files - -This section creates one long file that is a superset of what we need for individual CD target files. This long file has everything needed to extract and save a target file for any CD. It also has additional convenience variables that will not be included in individual CD target files such as variable descriptions, human-friendly AGI-range labels, state fips codes, and a sort code for ordering records within a CD. - -## Documentation for target files for individual CDs - -### Target file name - -Congressional District target files follow the naming convention **xxxx_targets.csv**, where **xxxx** is a 4 character CD identifier. - -- The first two characters are the state postal abbreviation or, in the case of the District of Columbia, "DC". (DC does not have a voting representative but does have a non-voting member. The SOI data have information for DC and so it is in the data. Thus, we have data for 435 voting districts, plus data for DC.) - -- The next 2 characters identify the Congressional District within the state, with a leading zero. 
For states that have more than one district, these range from 01 to the number of districts (for example, 53 in the case of California). For the 7 states and DC that have only one CD, these 2 characters are 00, following the SOI convention. - -- Thus, the filename for California's 3rd Congressional District would be CA03_targets.csv and allowable file names would range from CA01_targets.csv to CA53_targets.csv. There is no CA00_targets.csv. The filename for any of the 7 states (or DC) that have only one CD would be WY00_targets.csv. - -### Target file variables - -Each target file will have the following variables: - -- **varname**: This is a PUF-based filename, as used in Tax-Calculator. Thus, examples of allowable names are XTOT (but see below), e00200 (wages), c00100 (AGI, calculated), and e00900. - -- **count**: Indicates whether the target is a count or a dollar amount. Allowable values are 0 for dollar amount and 1 for count. - -- **scope**: Indicates which kinds of records the target applies to. Allowable values are 0 for all records, 1 for tax filers, and 2 for nonfilers. - -- **agilo**, **agihi**: Lower and upper bounds for the AGI range. The interval is of the form \[agilo, agihi) -- that is, it includes all values \>= agilo and \< agihi. - -- **fstatus**: Filing status, following the PUF MARS definition. Allowable values are integers 0-5, where 0 = all records, 1 = single, 2 = married filing joint, 3 = married filing separately, 4 = head of household, and 5 = surviving spouse. **\[?? VERIFY WITH MARTIN\]** - -- **target**: The SOI value (or other target, if the user overrides the SOI value) for this variable, scope, agi range, and filing status. Counts and dollar amounts are "raw" values - neither is scaled to be in thousands or millions, for example. (Because SOI reported dollar values usually are in \$ thousands, we have multipled them by 1,000 so that they are unscaled.) - -### The special first data row of a CD target file - -The area targeting software needs a value for total population in the area. It uses this to scale initial weights prior to optimization so that they sum to the area population. To assist in this, the target file must contain in its first data row a value for the total area population. This special row must have the following values: - -- **varname**: XTOT -- **count**: 0 -- **scope**: 0 -- **agilo**: must be \< -8e99 -- **agihi**: must be \> 8e99 -- **fstatus**: 0 -- **target**: area population - -For example, here is the first data row of an area that has population of 33 million: - -varname,count,scope,agilo,agihi,fstatus,target - -XTOT, 0, 0,-9e99, 9e99, 0, 33e6 - -## Setup - -```{r} -#| label: setup - -source(here::here("R", "libraries.R")) -source(here::here("R", "constants.R")) - -# 334283385.27000004 national pop - -# varname,count,scope,agilo,agihi,fstatus,target -# XTOT, 0, 0,-9e99, 9e99, 0, 33e6 -# e00300, 0, 1,-9e99, 9e99, 0, 20e9 -# e00900, 0, 1,-9e99, 9e99, 0, 30e9 -# e00200, 0, 1,-9e99, 9e99, 0,1000e9 -# e02000, 0, 1,-9e99, 9e99, 0, 30e9 -# e02400, 0, 1,-9e99, 9e99, 0, 60e9 - -``` - -## Get needed data - -```{r} -#| label: get-soi-based-data - -cdlong <- read_csv(fs::path(CDINTERMEDIATE, "cddata_long_clean.csv")) - -``` - - -## Create streamlined long CD-only file - -- Drop non-CD records. -- Drop variables we would never want. -- Construct fstatus 0 records and records with counts by filing status. 
- -### Drop non-CD records and variables we won't want - -```{r} -#| label: drop-records-and-variables - -cdlong1 <- cdlong |> - filter(rectype %in% c("cd", "cdstate", "DC")) - -# quick data checks -cdlong1 |> filter(AGI_STUB==0) |> count(STATE) -cdlong1 |> filter(AGI_STUB==0) |> select(STATE, CONG_DISTRICT) |> distinct() |> nrow() -count(cdlong1, vtype) - -# winnow data to variables we might consider targeting -# show candidates for dropping in a nice order -cdlong1 |> - filter(is.na(basevname)) |> - select(vname, description) |> - distinct() |> - arrange(vname) - -dropvars <- c("CPREP", "DIR_DEP", "ELDERLY", "ELF", "PREP", "RAC", "TCE", - "TOTAL_VITA", "VITA", "VITA_EIC", "VRTCRIND") - -cdlong2 <- cdlong1 |> - filter(!vname %in% dropvars) - -rm(dropvars) - -``` - - -### Address agi issue - -We have an issue with AGI: for Congressional Districts IRS does NOT report the number of returns with AGI. See the discussion on the introduction page. - -We are going to create new rows for N00100 (Number of returns with AGI (estimated)), equal to N1, add it to the data. - -```{r} -#| label: address-agi - -nagi <- cdlong2 |> - filter(vname == "N1") |> - mutate(vname = "N00100", - basevname = "v00100", - description = "Number of returns with Adjusted Gross Income - AGI (estimated)") - -cdlong3 <- bind_rows(cdlong2, nagi) |> - mutate(basevname = ifelse(vname == "A00100", - "v00100", - basevname)) - -# check -check <- cdlong3 |> - filter(basevname=="v00100", STATE=="WY") |> - arrange(STATE, CONG_DISTRICT, basevname, desc(vtype), AGI_STUB) - -rm(nagi, check) - -``` - - -### Define statecd, fstatus, scope, count; put amounts in dollars; sort - -```{r} -#| label: fstatus-misc - -cdlong4 <- cdlong3 |> - mutate( - fstatus = case_when( - vname == "MARS1" ~ 1, - vname == "MARS2" ~ 2, - vname == "MARS4" ~ 4, # VERIFY WITH MARTIN - .default = 0), - - count = case_when( - vtype == "count" ~ 1, - vtype == "amount" ~ 0, - .default = -99), - - scope = 1, - value = ifelse(vtype == "amount", - value * 1000, - value) - ) - -summary(cdlong4) -# skim(cdlong4) -count(cdlong4, fstatus) -count(cdlong4, count, vtype) -count(cdlong4, scope) - -``` - -### Add Census population records to data - -Prepare the Census population data. - -```{r} -#| label: prepare-census-pop - -# - **varname**: XTOT -# - **count**: 0 -# - **scope**: 0 -# - **agilo**: must be \< -8e99 -# - **agihi**: must be \> 8e99 -# - **fstatus**: 0 -# - **target**: area population - -cdpop <- read_csv(fs::path(CDINTERMEDIATE, "cdpop1year.csv")) -glimpse(cdpop) - -cdpop1 <- cdpop |> - select(STATEFIPS, STATE, CONG_DISTRICT, target=pop2021) |> - mutate(vname="XTOT", - basevname="XTOT", - description = "CD population in 2021 per Census ACS", - AGI_STUB = 0, - agirange = "Total", - agilo = -9e99, - agihi = 9e99, - count = 0, - scope = 0, - fstatus = 0) - -# how well does cdpop merge against the soi data? 
-soistubs <- cdlong4 |>
-  select(STATEFIPS, STATE, CONG_DISTRICT, rectype, ndist) |>
-  distinct()
-
-fmatch <- soistubs |>
-  left_join(cdpop1,
-            by = join_by(STATEFIPS, STATE, CONG_DISTRICT))
-
-fmatch |> filter(is.na(target)) # good, we matched on all CDs
-
-# put rectype and ndist on the cdpop file
-cdpop2 <- cdpop1 |>
-  left_join(fmatch |>
-              select(STATE, CONG_DISTRICT, rectype, ndist),
-            by = join_by(STATE, CONG_DISTRICT)) # statecd = paste0(STATE, CONG_DISTRICT),
-
-rm(soistubs, fmatch)
-
-```
-
-
-```{r}
-#| label: create-cdbasefile
-
-cdbasefile <- bind_rows(cdlong4 |>
-                          rename(target=value) |>
-                          mutate(src="soi"),
-                        cdpop2 |> mutate(src="census")) |>
-  mutate(statecd = paste0(STATE, CONG_DISTRICT),
-         basevname=case_when(
-           is.na(basevname) & vname=="A00101" ~ "v00101", # special handling
-           is.na(basevname) ~ vname,
-           .default = basevname)) |> # THINK ABOUT THIS CAREFULLY
-  select(src, rectype, stabbr=STATE, cd=CONG_DISTRICT, statecd,
-         agistub=AGI_STUB, agilo, agihi, basevname, scope, fstatus, count, target,
-         vname, description, agirange) |>
-  arrange(statecd, src, scope, fstatus, basevname, count, agistub)
-
-glimpse(cdbasefile)
-summary(cdbasefile)
-skim(cdbasefile)
-
-cdbasefile |> count(basevname)
-
-cdbasefile |> filter(statecd=="WY00", agistub==0, basevname=="v00100")
-cdbasefile |> filter(statecd=="WY00", agistub==0, basevname=="v00101")
-
-write_csv(cdbasefile, fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
-
-```
-
-
diff --git a/tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd b/tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd
deleted file mode 100644
index b0196f17..00000000
--- a/tmd/areas/targets/prepare/cd_create_crosswalk_from_cd117th_to_cd118th.qmd
+++ /dev/null
@@ -1,226 +0,0 @@
----
-output: html_document
-editor_options:
-  chunk_output_type: console
----
-
-# Prepare crosswalk from Congressional district boundaries for the 117th Congressional session to those for the 118th session
-
-IRS SOI data currently available for Congressional districts are based on 117th Congressional session boundaries, which were drawn using information from the 2010 decennial census. These sometimes differ significantly from current district boundaries for the 118th Congressional session, which were drawn based on data from the 2020 decennial census.
-
-To address this, we develop estimates for 118th session districts by allocating targets for 117th session districts to 118th session districts based on the fraction of each 117th district's 2020 population that is present in different 118th session districts.
-
-For example, California Congressional district 13 (CA-13) as defined in the 118th session includes 3.6% of the 2020 population of 117th session CA-09, 30.2% of the 2020 population of 117th session CA-10, 54.5% of the population of CA-16, and 13.7% of CA-21. To construct estimated targets for 118th session CA-13, we allocate these percentages of each target of the four 117th session districts (CA-09, CA-10, CA-16, and CA-21) to 118th session CA-13. We repeat this process for every 118th session Congressional district, for all potential targets.
-
-Doing this requires a crosswalk that shows what fraction of the 2020 population of each 117th district appears in each 118th session district. Fortunately, the Missouri Census Data Center (MCDC) has an online tool, [*Geocorr 2022: Geographic Correspondence Engine*](https://mcdc.missouri.edu/applications/geocorr2022.html), that can create such a crosswalk.
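
Before turning to the crosswalk file we actually used, here is a minimal sketch of the allocation arithmetic, using the CA-13 population shares quoted above and made-up target values (the column names mirror those created later in this chapter):

```{r}
#| label: allocation-sketch
#| eval: false

# Minimal sketch: a 118th-session target is the share-weighted sum of
# the overlapping 117th-session targets. Target values are made up.
library(dplyr)
library(tibble)

xw <- tribble(
  ~statecd117, ~statecd118, ~share117to118,
  "CA09", "CA13", 0.036,
  "CA10", "CA13", 0.302,
  "CA16", "CA13", 0.545,
  "CA21", "CA13", 0.137
)

targets117 <- tribble(
  ~statecd117, ~target,
  "CA09", 100e6,
  "CA10", 200e6,
  "CA16", 300e6,
  "CA21", 400e6
)

xw |>
  left_join(targets117, by = "statecd117") |>
  summarise(target = sum(share117to118 * target), .by = statecd118)
```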
On October 15, 2024 we used *Geocorr 2022* to create the crosswalk file "geocorr2022_2428906586.csv", in the cds/raw_data folder of this project. We use this file to construct population-weighted targets for 118th session Congressional districts from the 117th session targets. - -## Setup - -```{r} -#| label: setup -#| output: false - -source(here::here("R", "libraries.R")) -source(here::here("R", "constants.R")) - -# phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") - -``` - - -## Get data -```{r} -#| label: get-data -#| output: false - -cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv")) - -df <- read_csv(fs::path(CDRAW, "geocorr2022_2428906586.csv")) -glimpse(df) - -(xwlabs <- unlist(df[1, ], use.names = TRUE)) # variable names and labels - -xwalk1 <- df |> - filter(row_number() != 1) |> - rename_with(str_to_lower) |> - rename(stabbr=stab, pop2020=pop20) |> - mutate(across(pop2020:afact, as.numeric), - statecd117=paste0(stabbr, cd117), - statecd118=paste0(stabbr, cd118)) |> - rename(af118to117=afact2, - af117to118=afact) - -``` - - -```{r} -#| label: data-checks -#| output: false - -count(xwalk1, stabbr) # 52 including DC and PR -count(xwalk1, stabbr) |> filter(n==1) -xwalk1 |> filter(stabbr=="PR") - -# check numbers of districts -cd117codes <- unique(xwalk1$statecd117) |> sort() # 438 -- why? -cd118codes <- unique(xwalk1$statecd118) |> sort() # 437 -- why? - -cd117codes # DC98 instead of DC00; PR98; NC seems to have bad codes < 10 -cd118codes # DC98 instead of DC00; PR98; NC codes look ok here - -# do the shares of statecd117 given to various statecd118s add to 1? -xwalk1 |> - summarise(af117to118=sum(af117to118), .by=statecd117) |> - filter(af117to118 != 1) |> - arrange(desc(abs(af117to118 - 1))) # minimal differences from 1 - -# do the shares of statecd118 given to various statecd117s add to 1? -xwalk1 |> - summarise(af118to117=sum(af118to117), .by=statecd118) |> - filter(af118to117 != 1) |> - arrange(desc(abs(af118to117 - 1))) # minimal differences from 1 - -# do the individual shares of af117to118 match what we get with population? -xwalk1 |> - mutate(share117to118=pop2020 / sum(pop2020), .by=statecd117) |> - mutate(diff=share117to118 - af117to118) |> - relocate(af117to118, .before=share117to118) |> - arrange(desc(abs(diff))) # good, they match within small tolerances -# use our calculated amounts - -# do the individual shares of af118to117 match what we get with population? -xwalk1 |> - mutate(share118to117=pop2020 / sum(pop2020), .by=statecd118) |> - mutate(diff=share118to117 - af118to117) |> - relocate(af118to117, .before=share118to117) |> - arrange(desc(abs(diff))) # good, they match within small tolerances - -# how well do the cds match against our 117th cd data? -xwalk2 <- xwalk1 |> - filter(stabbr != "PR") |> - filter(cd117 != "-") |> # not sure what this is and pop2020 is only 13 - # redo codes - mutate( - oldcd117 = cd117, - cd117 = case_when(stabbr=="NC" & nchar(cd117) != 2 ~ - str_pad(as.integer(cd117), width=2, side="left", pad="0"), - .default = cd117), - statecd117=paste0(stabbr, cd117), - statecd118=paste0(stabbr, cd118), - statecd117 = case_when( - statecd117 == "DC98" ~ "DC00", - .default = statecd117), - statecd118 = case_when( - statecd118 == "DC98" ~ "DC00", - .default = statecd118)) - -xwalk2 |> - filter(cd117 != oldcd117) |> - relocate(oldcd117, .after=cd117) - -xwalk2 |> - filter(statecd118 == "NC14") - -# how do the 117th CDs match up? 
-usoi <- cd117$statecd |> unique()
-ugeo <- xwalk2$statecd117 |> unique()
-
-usoi
-ugeo
-
-setdiff(usoi, ugeo) # none missing
-setdiff(ugeo, usoi) # none missing
-
-check <- xwalk2 |>
-  filter(stabbr=="NC")
-
-```
-
-
-```{r}
-#| label: make-save-final-xwalk
-#| output: false
-
-# calc pop shares (so we have more precision than in the source data) and save
-xwalk3 <- xwalk2 |>
-  mutate(share117to118=pop2020 / sum(pop2020), .by=statecd117)
-
-xwalk3 |>
-  mutate(diff=share117to118 - af117to118) |>
-  relocate(af117to118, .before=share117to118) |>
-  arrange(desc(abs(diff))) # good, they match within small tolerances
-
-xwalk_final <- xwalk3 |>
-  select(stabbr, cd117, cd118, statecd117, statecd118, share117to118)
-
-write_csv(xwalk_final, fs::path(CDINTERMEDIATE, "xwalk_final.csv"))
-
-```
-
-
-## Create 118th Session Congressional Districts
-```{r}
-#| label: create-cd118
-#| output: false
-
-xwalk <- read_csv(fs::path(CDINTERMEDIATE, "xwalk_final.csv"))
-
-cd118v1 <- xwalk |>
-  rename(statecd=statecd118,
-         cd=cd118) |>
-  left_join(cd117 |>
-              select(-cd) |>
-              rename(statecd117=statecd),
-            by = join_by(stabbr, statecd117),
-            relationship = "many-to-many")
-
-# collapse the file to statecd
-cd118v2 <- cd118v1 |>
-  mutate(target=target * share117to118) |>
-  summarise(target=sum(target),
-            .by=c(stabbr, cd, statecd, src, rectype,
-                  agistub, agilo, agihi, basevname,
-                  scope, fstatus, count, vname, description, agirange))
-
-glimpse(cd118v2)
-summary(cd118v2)
-write_csv(cd118v2, fs::path(CDINTERMEDIATE, "cdbasefile_118.csv"))
-
-
-```
-
-
-```{r}
-
-
-cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile.csv"))
-cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv"))
-
-stack <- bind_rows(
-  cd117 |> mutate(session="s117"),
-  cd118 |> mutate(session="s118")
-)
-
-glimpse(stack)
-states <- stack |>
-  summarise(target=sum(target),
-            .by=c(session, stabbr, src, rectype,
-                  agistub, agilo, agihi, basevname,
-                  scope, fstatus, count, vname, description, agirange))
-
-states |>
-  pivot_wider(names_from = session,
-              values_from = target) |>
-  mutate(diff=s118 - s117,
-         pdiff=diff / s117) |>
-  arrange(desc(abs(pdiff))) # good, all the state sums work
-
-write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv"))
-
-```
-
-
-
-
-
diff --git a/tmd/areas/targets/prepare/cd_get_census_population.qmd b/tmd/areas/targets/prepare/cd_get_census_population.qmd
deleted file mode 100644
index cbec8c94..00000000
--- a/tmd/areas/targets/prepare/cd_get_census_population.qmd
+++ /dev/null
@@ -1,94 +0,0 @@
----
-output: html_document
-editor_options:
-  chunk_output_type: console
----
-
-# Get Congressional District Census population data
-
-## Setup
-
-```{r}
-#| label: setup
-
-source(here::here("R", "libraries.R"))
-source(here::here("R", "constants.R"))
-
-# 334283385.27000004 national pop
-
-```
-
-
-## Get Congressional District population
-
-```{r}
-#| label: cdpop-download
-#| eval: false
-
-# Note that 2021 gets data from the 116th Congress and 2022 gets the 118th Congress
-
-cdpop5year <- get_acs(
-  geography = "congressional district",
-  variables = "B01003_001", # Total population variable
-  year = 2021,
-  survey = "acs5" # Use 5-year estimates for better coverage, especially in smaller areas
-)
-write_csv(cdpop5year, fs::path(CDRAW, "cdpop5year_acs.csv"))
-
-cdpop1year <- get_acs(
-  geography = "congressional district",
-  variables = "B01003_001", # Total population variable
-  year = 2021,
-  survey = "acs1" # Use 1-year estimates for the most recent single year
-)
-write_csv(cdpop1year, fs::path(CDRAW, "cdpop1year_acs.csv")) - -``` - -## Clean Congressional District population - -```{r} -#| label: cdpop-clean - -cdpop1year <- read_csv(fs::path(CDRAW, "cdpop1year_acs.csv")) -cdpop5year <- read_csv(fs::path(CDRAW, "cdpop5year_acs.csv")) - -cdpop1year |> summarise(estimate=sum(estimate)) # 335157329 -cdpop5year |> summarise(estimate=sum(estimate)) # 333036755 - - -stcodes <- tigris::states() |> - as.data.frame() |> - select(STATEFIPS=STATEFP, STATE=STUSPS) - -cdpop1 <- cdpop1year |> - mutate(STATEFIPS = str_sub(GEOID, 1, 2), - CONG_DISTRICT = str_sub(GEOID, 3, 4)) |> - left_join(stcodes, by = join_by(STATEFIPS)) |> - filter(STATE != "PR") |> # we're not using Puerto Rico - mutate(CONG_DISTRICT = ifelse(STATE == "DC", - "00", # Census data has 98 for DC - CONG_DISTRICT)) |> - select(STATEFIPS, STATE, CONG_DISTRICT, cdname=NAME, pop2021=estimate) - -count(cdpop1, STATEFIPS, STATE) - -cdpop1 |> filter(STATE=="NY") - -cdpop1 |> filter(STATE=="AK") -cdpop1 |> filter(STATE=="DC") -cdpop1 |> filter(STATE=="WY") - -count(cdpop1, CONG_DISTRICT) -count(cdpop1, STATEFIPS, STATE) - -glimpse(cdpop1) -sum(cdpop1$pop2021) # 331,893,745 compared to Martin's 334,283,385 - -write_csv(cdpop1, fs::path(CDINTERMEDIATE, "cdpop1year.csv")) - - -``` - - - diff --git a/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd b/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd deleted file mode 100644 index dcba9b19..00000000 --- a/tmd/areas/targets/prepare/cd_map_tcvars_and_extract_target_files.qmd +++ /dev/null @@ -1,225 +0,0 @@ ---- -output: html_document -editor_options: - chunk_output_type: console ---- - -# Map tax calculator vars to soi vars and extract targets - -```{r} -#| label: setup - -source(here::here("R", "libraries.R")) -source(here::here("R", "constants.R")) - -# 334283385.27000004 national pop - - -``` - -## Combine cd117 and cd118 into a stacked cd file - -```{r} -#| label: get-cdbasefile -#| output: false - -cd117 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_117.csv")) -cd118 <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_118.csv")) - -stack <- bind_rows( - - cd117 |> mutate(session=117), - cd118 |> mutate(session=118) -) - -glimpse(stack) - - - -write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) - -``` - - -```{r} -#| label: check-state-totals-interactively -#| eval: false -#| output: false - -# check whether state totals are good -stack <- bind_rows( - cd117 |> mutate(session="s117"), - cd118 |> mutate(session="s118") -) - -write_csv(stack, fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) - -glimpse(stack) - -# states <- stack |> -# summarise(target=sum(target), -# .by=c(session, stabbr, src, rectype, -# agistub, agilo, agihi, basevname, -# scope, fstatus, count, vname, description, agirange)) -# -# states |> -# pivot_wider(names_from = session, -# -# values_from = target, -# names_prefix="s") |> -# mutate(diff=s118 - s117, -# pdiff=diff / s117) |> -# arrange(desc(abs(pdiff))) # good all the state sums work -``` - - - -## Get saved variable mapping - -```{r} -#| label: tc-soi-variablemap -#| output: false - -soivars <- count(stack, basevname) -# soivars$basevname - -vmap <- read_csv(fs::path(CDINTERMEDIATE, "cd_variable_mapping.csv")) - -# the MARS mappings let us get counts by filing status by agi range -# vmap <- read_csv(file=" -# tcvar, soivar -# XTOT, XTOT -# c00100, MARS1 -# c00100, MARS2 -# c00100, MARS4 -# c00100, v00100 -# e00200, v00200 -# e00300, v00300 -# e01700, v01700 -# e26270, 
v26270 -# ") - -``` - -```{r} -#| label: mapped-file -#| output: false - -mapped <- stack |> - filter(basevname %in% vmap$soivar) |> - mutate(varname=factor(basevname, levels=vmap$soivar, labels=vmap$tcvar)) - -# count(mapped, varname, vname) - -``` - -```{r} -#| label: extracts -#| output: false - -# varname,count,scope,agilo,agihi,fstatus,target -# XTOT, 0, 0,-9e99, 9e99, 0, 33e6 -# e00300, 0, 1,-9e99, 9e99, 0, 20e9 - -# define extracts we want - -phase4cds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") -# statecds <- "NY21" - -statecds <- phase4cds -session_number <- 117 - -extracted <- mapped |> - filter(statecd %in% statecds) |> - filter(session==paste0("s", session_number)) |> - arrange(statecd, src, scope, fstatus, basevname, count, agistub) # to be safe - -count(extracted, statecd) -count(extracted, basevname, varname) - -targets <- extracted |> - filter(varname %in% c("XTOT", "c00100", "e00200", "e26270")) |> - filter(varname == "XTOT" | (agistub != 0)) |> - filter(!(count == 1 & (varname != "c00100"))) |> - select(statecd, varname, count, scope, agilo, agihi, fstatus, target) - -check <- targets |> - filter(statecd == "AK00") - -f <- function(data, group){ - cd <- group$statecd |> - str_to_lower() - fname <- paste0(cd, "_targets.csv") - fpath <- fs::path(CDFINAL, fname) - print(fpath) - write_csv(data, fpath) -} - -targets |> - group_by(statecd) |> - group_walk(~f(.x, .y)) - - -# write_csv(targets, fs::path(CDFINAL, "ny21_targets.csv")) - -``` - - -## Documentation for target files for individual CDs - -### Target file name - -Congressional District target files follow the naming convention **xxxx_targets.csv**, where **xxxx** is a 4 character CD identifier. - -- The first two characters are the state postal abbreviation or, in the case of the District of Columbia, "DC". (DC does not have a voting representative but does have a non-voting member. The SOI data have information for DC and so it is in the data. Thus, we have data for 435 voting districts, plus data for DC.) - -- The next 2 characters identify the Congressional District within the state, with a leading zero. For states that have more than one district, these range from 01 to the number of districts (for example, 53 in the case of California). For the 7 states and DC that have only one CD, these 2 characters are 00, following the SOI convention. - -- Thus, the filename for California's 3rd Congressional District would be CA03_targets.csv and allowable file names would range from CA01_targets.csv to CA53_targets.csv. There is no CA00_targets.csv. The filename for any of the 7 states (or DC) that have only one CD would be WY00_targets.csv. - -### Target file variables - -### The special first data row of a CD target file - -The area targeting software needs a value for total population in the area. It uses this to scale initial weights prior to optimization so that they sum to the area population. To assist in this, the target file must contain in its first data row a value for the total area population. 
This special row must have the following values: - -- **varname**: XTOT -- **count**: 0 -- **scope**: 0 -- **agilo**: must be \< -8e99 -- **agihi**: must be \> 8e99 -- **fstatus**: 0 -- **target**: area population - -For example, here is the first data row of an area that has population of 33 million: - -varname,count,scope,agilo,agihi,fstatus,target - -XTOT, 0, 0,-9e99, 9e99, 0, 33e6 - -For up-to-date documentation of target files, see the associated [README](https://github.com/PSLmodels/tax-microdata-benchmarking/blob/master/tmd/areas/targets/README.md). The following is from the version that was current as of 2024-11-01: - -> An areas targets file is a CSV-formatted file with its first row containing column names and its second row containing the area population target. Each subsequent row contains another target. Rows after the first two that start with a `#` character are considered comments and are skipped. -> -> Here are the column names and their valid values: -> -> 1. **`varname`**: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the `tmd/storage/__init__.py` file -> 2. **`count`**: integer in \[0,4\] range: -> - count==0 implies dollar total of varname is tabulated -> - count==1 implies number of tax units with **any** value of varname is tabulated -> - count==2 implies number of tax units with a **nonzero** value of varname is tabulated -> - count==3 implies number of tax units with a **positive** value of varname is tabulated -> - count==4 implies number of tax units with a **negative** value of varname is tabulated -> 3. **`scope`**: integer in \[0,2\] range: -> - scope==0 implies all tax units are tabulated -> - scope==1 implies only PUF-derived filing units are tabulated -> - scope==2 implies only CPS-derived filing units are tabulated -> 4. **`agilo`**: float representing lower bound of the AGI range (which is included in the range) that is tabulated. -> 5. **`agihi`**: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated. -> 6. **`fstatus`**: integer in \[0,5\] range: -> - fstatus=0 implies all filing statuses are tabulated -> - other fstatus values imply just the tax units with the Tax-Calculator `MARS` variable equal to fstatus are included in the tabulation -> 7. **`target`**: target amount: -> - dollars if count==0 -> - number of tax units if count\>0 - diff --git a/tmd/areas/targets/prepare/cd_overall_documentation.qmd b/tmd/areas/targets/prepare/cd_overall_documentation.qmd deleted file mode 100644 index 48058751..00000000 --- a/tmd/areas/targets/prepare/cd_overall_documentation.qmd +++ /dev/null @@ -1,139 +0,0 @@ ---- -output: html_document -editor_options: - chunk_output_type: console ---- - -# About the data - -This chapter has two sections: - -- IRS documentation: Copied verbatim from the IRS SOI data documentation (21incddocguide.docx), with no substantive edits and no commentary. - -- Comments on the data: Notes about selected issues, quirks, and pitfalls in the data discovered from working with the data. - -## IRS documentation - -All text in this section is a direct quote from IRS documentation. - -### Time period - -The Statistics of Income (SOI) Division’s Congressional district data is tabulated using individual income tax returns (Forms 1040) filed with the Internal Revenue Service (IRS) during the 12-month period, January 1, 2022 to December 31, 2022. 
While the bulk of returns filed during this 12-month period are primarily for Tax Year 2021, the IRS received a limited number of returns for tax years before 2021. These prior-year returns are used as a proxy for returns that are typically filed beyond the 12-month period and have been included within the congressional district data. - -### Population Definitions and Tax Return Addresses - -- Congressional data are based on the population of individual income tax returns processed by the IRS during the 2022 calendar year. - -- Returns filed for the purpose of receiving an Economic Impact Payment, due to COVID-19, were excluded from the data. - -- State totals within the Congressional data may not be comparable to State totals published elsewhere by SOI because of disclosure protection procedures or the exclusion of returns that did not match based on the ZIP code. See footnote for complete State totals. \[2\] - -- Data do not represent the full U.S. population because many individuals are not required to file an individual income tax return. - -- The address shown on the tax return may differ from the taxpayer’s actual residence. - -- Congressional districts were based on the ZIP code shown on the return. - -- Tax returns filed without a ZIP code and returns filed with a ZIP code that did not match the State code shown on the return were excluded. - -- Tax returns filed using Army Post Office (APO) and Fleet Post Office addresses, foreign addresses, and addresses in Puerto Rico, Guam, Virgin Islands, American Samoa, Marshall Islands, Northern Marianas, and Palau were excluded. - -### Congressional District and ZIP Code Matching Procedures - -SOI uses a commercial file to match ZIP codes to congressional districts. Congressional districts cover the 435 congressional districts in the 50 states and the District of Columbia. District boundaries are based on the 117th Congress. - -The matching process first utilizes the 9-digit ZIP code, if present on the return, to determine the proper congressional district for that return. Nearly 97 percent of the returns match on the 9-digit ZIP code. When the 9-digit ZIP code is not available, the matching process uses the 5-digit ZIP code to determine the proper congressional district. Returns that do not match on ZIP code, or where a ZIP code is not present, are excluded from the data. - -Eight states (AK, DC, DE, MT, ND, SD, VT, and WY) have only one congressional district, therefore the matching procedures are not performed on these states. Returns with only one congressional district represent 2 percent of the total number of returns. - -### Disclosure Protection Procedures - -SOI did not attempt to correct any ZIP codes listed on the tax returns; however, it did take the following precautions to avoid disclosing information about specific taxpayers: - -- Income and tax items with less than 20 returns for a particular AGI class were combined with another AGI class within the same congressional district. Collapsed AGI classes are identified with a double asterisk (\*\*) in the Excel files. - -- Income and tax items with less than 20 returns for a congressional district were excluded. - -- If an income or tax item from one return constitutes more than a specified percentage of the total of any particular cell, the specific data item for that return is excluded from that cell. For example, if the amount for wages from one return represents 75 percent of the value of the total for that cell, the data item will be suppressed. 
The actual threshold percentage used cannot be released. - -### IRS Endnotes - -[1] The use of prior-year returns as a proxy for returns that are filed beyond the current processing year is consistent with SOI’s national, state, county, and ZIP code tabulations. A description of SOI’s sample, which is used as an input for the geographic data, and the use of prior-year returns, can be found at https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-returns-publication-1304-complete-report#_sec2. - -[2] For complete individual income tax tabulations at the State level, see the historic table posted to Tax Stats at http://www.irs.gov/uac/SOI-Tax-Stats---Historic-Table-2. - -[3] The State Federal Information Processing System (FIPS) codes used for these statistics were derived from the U.S. Census Bureau. A complete list of codes can be obtained from https://www.census.gov/geo/reference/codes/cou.html. - -[4] "Number," here, and elsewhere represents number of returns, unless otherwise specified. The number of returns have been rounded to the nearest ten. - -[5] Beginning in 2018, personal exemption deductions were suspended for the primary, secondary, and dependent taxpayers. However, the data used to create the “Number of individuals”—filing status, dependent status indicator, and identifying dependent information—are still available on the Form 1040. This field is based on these data. - -[6] The "Number of volunteer prepared returns" shows counts of returns prepared by IRS-certified volunteers to taxpayers with limited income, persons with disabilities, limited English speaking taxpayers, current and former members of the military, and taxpayers who are 60 years of age and older. - -[7] These data do not distinguish between refund anticipation loans and refund advances. - -[8] Includes primary taxpayers 60 years of age and older. - -[9] Less deficit. - -[10] “Qualified dividends” are ordinary dividends received in tax years beginning after 2002 that meet certain conditions and receive preferential tax rates. - -[11] Includes the Alaskan permanent fund, reported by residents of Alaska on Forms 1040A and 1040EZ's. -This fund only applies to statistics in the totals, and the state of Alaska. - -[12] “Total tax credits" represent the summation of taxpayer reported credit items made up in the calculation of the total credits line of Form 1040. It does not include the "earned income credit" and "refundable education credit," which are shown separately below in the table. - -[13] Earned income credit includes both the refundable and non-refundable portions. The non-refundable portion could reduce income tax and certain related taxes to zero. The earned income credit amounts in excess of total tax liability, or amounts when there was no tax liability at all, were refundable. See footnote 14 below for explanation of the refundable portion of the earned income credit. - -[14] The refundable portion of the earned income credit equals total income tax minus the earned income credit. If the result is negative, this amount is considered the refundable portion. No other refundable credits were taken into account for this calculation. - -[15] The "refundable education credit" can partially or totally offset tax liability as well as be totally refundable. - -[16] The 2021 recovery rebate credit was created by the American Rescue Plan Act of 2021, Public Law 117-2, 135 Stat. 4 (March 11, 2021). 
The recovery rebate credit was a credit against income tax for tax year 2021, but the American Rescue Plan Act of 2021 directed the IRS to make advance refunds of the recovery rebate credit “as rapidly as possible,” using information from tax year 2020 returns. - -An advance refund of the 2021 recovery rebate credit made under section 6428B of the Internal Revenue Code (Code), which was added by the American Rescue Plan Act of 2021, is referred to as a third round Economic Impact Payment (EIP). The dollar amount of the 2021 recovery rebate credit received by the taxpayer when they file their 2021 return is equal to a tentative amount, calculated based on the taxpayer’s 2021 return information, which is then reduced (but not below zero) by the amount of the third round EIP received by the taxpayer. The number of returns reported here for the 2021 recovery rebate credit is the number of returns that claimed some portion of the recovery rebate credit on their tax year 2021 return. -(A taxpayer would want to claim the recovery rebate credit if they did not receive the full amount of the credit as an EIP, which could happen, for example, if their income or number of dependent children on their 2021 return was different than on the 2020 return used as the basis for determining their EIP amounts). The amount reported here for the 2021 recovery rebate credit is the amount of the recovery rebate credit claimed by taxpayers on 2021 returns, which does not include the amount received as EIPs. - -[17] This table includes only payments issued to taxpayers who filed tax year 2021 returns. Individuals who received a third round Economic Impact Payment but did not file a 2021 return are excluded from these tabulations. For tabulations that include all recipients of third round EIPs, see: https://www.irs.gov/statistics/soi-tax-stats-coronavirus-aid-relief-and-economic-security-act-cares-act-statistics - - -[18] Section 6428B of the Internal Revenue Code (Code) directed the IRS to use information from tax year 2020 returns to determine eligibility for and the amount of the third round EIP. In contrast, this table is based primarily on information from tax year 2021 returns. Income, location, and household composition may have changed between the 2020 return used for the third round EIP and the 2021 return used for this table. Many taxpayers who would have been ineligible to claim the 2021 recovery rebate credit on their -2021 return because their 2021 income exceeded the phaseout region for the credit were, nevertheless, eligible for a third round EIP on the basis of their 2020 income. Section 6428B of the Code did not include any provisions for otherwise eligible taxpayers who had income in the eligible range in 2020 but whose income exceeded the eligible range in 2021 to pay back any of the third round EIP. For details on eligibility criteria, amount, and phaseout structure of the third round EIP, see: https://www.irs.gov/statistics/soi-tax-stats-coronavirus-aid-relief-and-economic-security-act-cares-act-statistics - -[19] “Total tax liability” differs from “Income tax”, in that “Total tax liability” includes the taxes from recapture of certain prior-year credits, tax applicable to individual retirement arrangements (IRA's), social security taxes on self-employment income and on certain tip income, advanced earned income payments, household employment taxes, and certain other taxes listed in the Form 1040 instructions. - -[20] Reflects payments to or withholdings made to "Total tax liability". 
This is the amount the tax filer owes when the income tax return is filed.
-
-[21] The amount of overpayments the tax filer requested to have refunded.
-
-
-## Comments on the data
-
-### Determining which records are Congressional District records
-
-- Calculate nstub0 -- number of records by state where AGI_STUB == 0 (the totals record)
-- Note that CONG_DISTRICT == "00" is a totals record for the state. There are 8 states that only have 1 CD (see IRS documentation above), and for those states this record doubles as a CD record and as the state record.
-- Determine type of record (a sketch implementing these rules appears at the end of this chapter):
-  - US -- STATE == "US"
-  - DC -- STATE == "DC"
-  - state -- nstub0 \> 1 & CONG_DISTRICT == "00"
-  - cdstate -- nstub0 == 1 (this is both a state record and a CD record, for 8 states)
-  - cd -- nstub0 \> 1 & CONG_DISTRICT != "00"
-
-The cd and cdstate records have data for Congressional Districts. There are 435 of these for AGI_STUB == 0 - one for each voting Congressional District (not including the District of Columbia). SOI data also have records for the nonvoting DC district. It is not included in the 435 Congressional Districts.
-
-The state and cdstate records have data for states. There are 51 of these (50 states plus DC).
-
-![](images/clipboard-719051713.png)
-
-To verify that this produces a proper calculation of the number of districts by state, I asked ChatGPT (4o) the following question, and compared the results by state to the calculation above. They are the same.
-
-> Please give me a table of the number of congressional districts by state (plus the District of Columbia), based on the 117th Congress, ideally as a google sheet or exportable to a spreadsheet. It should have 3 columns: state postal abbreviation, state name, and number of districts. It should add to 435 districts, I believe.
-
-### Exemptions
-
-Note that there are no data on exemptions but we do have total number of individuals (N2).
-
-When run on 2024-10-12 tmd national population was 334,283,385 (\`national_population = (vardf.s006 \* vardf.XTOT).sum()\`). By contrast, the sum of N2 for the U.S. was 289,054,220, or 13.5% less, according to 21incdall.xlsx.
-
-FWIW, IRS total number of returns in 2021 was 160,824,340 per 21in14ar.xls. When run on 2024-10-12 tmd sum of s006 was 184,024,657, or 14.4% more. By contrast, the sum of N1 for the U.S. was 157,375,370, or 2.1% less, according to 21incdall.xlsx.
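
As promised above, here is a minimal sketch of the record-type rules in code (not part of the pipeline; it assumes a data frame with STATE, CONG_DISTRICT, and AGI_STUB columns):

```{r}
#| label: rectype-sketch
#| eval: false

# Minimal sketch of the record-type rules described earlier in this
# chapter; nstub0 is the per-state count of AGI_STUB == 0 records.
library(dplyr)

classify_rectype <- function(df) {
  df |>
    mutate(nstub0 = sum(AGI_STUB == 0), .by = STATE) |>
    mutate(rectype = case_when(
      STATE == "US" ~ "US",
      STATE == "DC" ~ "DC",
      nstub0 > 1 & CONG_DISTRICT == "00" ~ "state", # state totals record
      nstub0 == 1 ~ "cdstate",                      # single-CD states
      .default = "cd"                               # ordinary CD record
    ))
}
```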
diff --git a/tmd/areas/targets/prepare/cd_write_target_files.qmd b/tmd/areas/targets/prepare/cd_write_target_files.qmd deleted file mode 100644 index 8885eec5..00000000 --- a/tmd/areas/targets/prepare/cd_write_target_files.qmd +++ /dev/null @@ -1,268 +0,0 @@ ---- -output: html_document -editor_options: - chunk_output_type: console ---- - -# Map tax calculator vars to soi vars and extract targets - -```{r} -#| label: setup - -source(here::here("R", "libraries.R")) -source(here::here("R", "constants.R")) - -library(jsonlite) - -``` - - -```{r} -#| label: constants -#| output: false - -phase4cds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") - -``` - - - -```{r} -#| label: json-rules -#| eval: false -#| output: false - -# Data is in name/value pairs -# Data is separated by commas -# Curly braces hold objects -# Square brackets hold arrays - -# In JSON, values must be one of the following data types: -# a string -# a number -# an object -# an array -# a boolean -# null - -# { -# "key": "String", -# "Number": 1, -# "array": [1,2,3], -# "nested": { -# "literals": true -# } -# } - -# https://r4ds.hadley.nz/rectangling.html#json - -``` - - -```{r} -#| label: target-file-rules -#| eval: false -#| output: false - -# count: integer in [0,4] range: -# count==0 implies dollar total of varname is tabulated -# count==1 implies number of tax units with any value of varname is tabulated -# count==2 implies number of tax units with a nonzero value of varname is tabulated -# count==3 implies number of tax units with a positive value of varname is tabulated -# count==4 implies number of tax units with a negative value of varname is tabulated - -# scope: integer in [0,2] range: -# scope==0 implies all tax units are tabulated -# scope==1 implies only PUF-derived filing units are tabulated -# scope==2 implies only CPS-derived filing units are tabulated - -``` - - -## Get target base data - -```{r} -#| label: tc-soi-variablemap -#| output: false - -stack <- read_csv(fs::path(CDINTERMEDIATE, "cdbasefile_sessions.csv")) - -soivars <- count(stack, basevname) -# soivars$basevname - -# the MARS mappings let us get counts by filing status by agi range -vmap <- read_csv(file=" -varname, basevname -XTOT, XTOT -c00100, v00100 -e00200, v00200 -e00300, v00300 -e01700, v01700 -e26270, v26270 -") - -# c00100, MARS1 -# c00100, MARS2 -# c00100, MARS4 - -``` - - -## Get targets recipes - -```{r} -#| label: get-recipe-json -#| eval: true -#| output: false - -# create a csv file that we will left-join with - -fpath <- fs::path(CDFINAL, "cdrecipe.json") -cdrecipe <- read_json(fpath) -names(cdrecipe) - -if(cdrecipe$cdtype == "phase4"){ - cdlist <- phase4cds -} -cdlist - -# varnames <- cdrecipe$targets|> -# map_chr("varname") -# varnames - -``` - - -```{r} - -f <- function(target){ - # for later -- a first step in adding income ranges as a possibility - # if(!"agilo" %in% names(target)) target$agilo <- -9e99 - # if(!"agihi" %in% names(target)) target$agihi <- 9e99 - as_tibble(target) -} - -cdrecipe$targets[[1]] - -names(stack) |> sort() -tmp <- count(stack, basevname, vname, varname) - -targets_tibble <- cdrecipe$targets |> - purrr::map(f) |> - purrr::list_rbind() |> - left_join(vmap, - by = join_by(varname)) |> - mutate(basevname = case_when(fstatus == 1 ~ "MARS1", - fstatus == 2 ~ "MARS2", - fstatus == 4 ~ "MARS4", - .default = basevname)) - -targets_tibble - -stack |> - filter(statecd=="AK00", session==118, basevname=="XTOT") -count(stack, fstatus) - -mapped <- targets_tibble |> - 
left_join(stack |> - filter(statecd %in% cdlist, - session %in% cdrecipe$session, - !(agistub == 0 & basevname !="XTOT")), - by = join_by(basevname, scope, count, fstatus), - relationship = "many-to-many") |> - mutate(group = case_when(basevname=="XTOT" & scope==0 & count==0 & fstatus==0 ~ 1, - .default = 2)) |> - arrange(statecd, group, varname, scope, count, fstatus, agistub) |> - mutate(sort=row_number(), .by=group) - - -tmp <- mapped |> - filter(statecd=="AK00") - -``` - -```{r} -#| label: write-targets -#| output: false - -# varname,count,scope,agilo,agihi,fstatus,target -# XTOT, 0, 0,-9e99, 9e99, 0, 33e6 -# e00300, 0, 1,-9e99, 9e99, 0, 20e9 - -f <- function(data, group){ - cd <- group$statecd |> - str_to_lower() - fname <- paste0(cd, "_targets.csv") - fpath <- fs::path(CDFINAL, fname) - print(fpath) - write_csv(data, fpath) -} - -mapped |> - select(statecd, varname, count, scope, agilo, agihi, fstatus, target) |> - group_by(statecd) |> - group_walk(~f(.x, .y)) - - -# write_csv(targets, fs::path(CDFINAL, "ny21_targets.csv")) - - -``` - -## Documentation for target files for individual CDs - -### Target file name - -Congressional District target files follow the naming convention **xxxx_targets.csv**, where **xxxx** is a 4 character CD identifier. - -- The first two characters are the state postal abbreviation or, in the case of the District of Columbia, "DC". (DC does not have a voting representative but does have a non-voting member. The SOI data have information for DC and so it is in the data. Thus, we have data for 435 voting districts, plus data for DC.) - -- The next 2 characters identify the Congressional District within the state, with a leading zero. For states that have more than one district, these range from 01 to the number of districts (for example, 53 in the case of California). For the 7 states and DC that have only one CD, these 2 characters are 00, following the SOI convention. - -- Thus, the filename for California's 3rd Congressional District would be CA03_targets.csv and allowable file names would range from CA01_targets.csv to CA53_targets.csv. There is no CA00_targets.csv. The filename for any of the 7 states (or DC) that have only one CD would be WY00_targets.csv. - -### Target file variables - -### The special first data row of a CD target file - -The area targeting software needs a value for total population in the area. It uses this to scale initial weights prior to optimization so that they sum to the area population. To assist in this, the target file must contain in its first data row a value for the total area population. This special row must have the following values: - -- **varname**: XTOT -- **count**: 0 -- **scope**: 0 -- **agilo**: must be \< -8e99 -- **agihi**: must be \> 8e99 -- **fstatus**: 0 -- **target**: area population - -For example, here is the first data row of an area that has population of 33 million: - -varname,count,scope,agilo,agihi,fstatus,target - -XTOT, 0, 0,-9e99, 9e99, 0, 33e6 - -For up-to-date documentation of target files, see the associated [README](https://github.com/PSLmodels/tax-microdata-benchmarking/blob/master/tmd/areas/targets/README.md). The following is from the version that was current as of 2024-11-01: - -> An areas targets file is a CSV-formatted file with its first row containing column names and its second row containing the area population target. Each subsequent row contains another target. Rows after the first two that start with a `#` character are considered comments and are skipped. 
-> -> Here are the column names and their valid values: -> -> 1. **`varname`**: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the `tmd/storage/__init__.py` file -> 2. **`count`**: integer in \[0,4\] range: -> - count==0 implies dollar total of varname is tabulated -> - count==1 implies number of tax units with **any** value of varname is tabulated -> - count==2 implies number of tax units with a **nonzero** value of varname is tabulated -> - count==3 implies number of tax units with a **positive** value of varname is tabulated -> - count==4 implies number of tax units with a **negative** value of varname is tabulated -> 3. **`scope`**: integer in \[0,2\] range: -> - scope==0 implies all tax units are tabulated -> - scope==1 implies only PUF-derived filing units are tabulated -> - scope==2 implies only CPS-derived filing units are tabulated -> 4. **`agilo`**: float representing lower bound of the AGI range (which is included in the range) that is tabulated. -> 5. **`agihi`**: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated. -> 6. **`fstatus`**: integer in \[0,5\] range: -> - fstatus=0 implies all filing statuses are tabulated -> - other fstatus values imply just the tax units with the Tax-Calculator `MARS` variable equal to fstatus are included in the tabulation -> 7. **`target`**: target amount: -> - dollars if count==0 -> - number of tax units if count\>0 \ No newline at end of file From 05d40ec6107d779062edf87cd3d756a810f5d751 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 17:01:46 -0500 Subject: [PATCH 32/33] fix error in calculating nonzero counts for the comparison analysis --- ...compare_us_totals_tmd_vs_irs_published.qmd | 49 +++++++++++++++++-- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd index f2687ef6..9e6899b2 100644 --- a/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd +++ b/tmd/areas/targets/prepare/cd_compare_us_totals_tmd_vs_irs_published.qmd @@ -31,6 +31,7 @@ fpath <- fs::path(TMDDIR, "cached_allvars.csv") tmd2021 <- vroom(fpath) ns(tmd2021) + ``` ## Create comparison file @@ -95,7 +96,7 @@ tmd2 <- tmd2021 |> as.integer()) |> summarize(across(all_of(variables_to_sum), list(amount = \(x) sum(x * s006), - nzcount = \(x) sum(x!=0 * s006), + nzcount = \(x) sum((x!=0) * s006), allcount= \(x) sum(s006))), .by=c(scope, agistub)) |> arrange(scope, agistub) @@ -105,6 +106,11 @@ tmd2 |> select(scope, agistub, contains("nzcount")) tmd2 |> select(scope, agistub, contains("allcount")) tmd2 |> select(scope, agistub, contains("amount")) +tmd2 |> + filter(scope==1) |> + select(scope, agistub, contains("c00100") & contains("count")) |> + janitor::adorn_totals() + # flip around and get count tmd3 <- tmd2 |> @@ -117,10 +123,9 @@ tmd3 <- tmd2 |> .default = -9e9)) count(tmd3, count, type) -tmd3 |> filter(count==2, agistub==4) # looks good +tmd3 |> filter(count==2, agistub==1) # looks good tmd3 |> filter(count==1, agistub==4) # 37,694,755 is the bad val allcount - # separate the mars and nonmars variables to get proper mars values tmdxmars <- tmd3 |> @@ -184,7 +189,7 @@ tmd_adjusted <- tmd_scopes |> ``` -## Prepare comaparison file +## Prepare comparison file ```{r} #| label: prepare-compare @@ -201,12 +206,45 @@ comp <- tmd_adjusted |> summary(comp) skim(comp) 
+write_csv(comp, fs::path(CDINTERMEDIATE, "cd_tmd_irs_compare.csv")) + +``` + +## Explore comparisons file + +```{r} +#| label: explore-compare +#| eval: true +#| output: false + +comp <- read_csv(fs::path(CDINTERMEDIATE, "cd_tmd_irs_compare.csv")) + comp |> arrange(desc(abs(pdiff))) badmatches <- c("e18400", "e18500", "e02400") # variables where tmd and IRS concepts are not well aligned + +badmatches <- c("e18400", "e18500", "e02400", "e26270") # to make it easier to examine other variables + +check <- comp |> + filter(!varname %in% badmatches, count==2) |> + arrange(desc(abs(pdiff))) + +tmd2021 |> + filter(data_source==1, c00100 != 0) |> + summarise(n=sum(s006)) # 160850840 + +temp2 <- tmd_adjusted |> filter(fstatus==0, agistub==0, varname=="c00100", scope==1) +temp2 |> gt() |> fmt_number(wtdvalue, decimals=0) +# count 1 161,696,687 +# count 2 160,850,840 + +temp <- comp |> + filter(varname=="c00100", scope==1, agistub==0, count==2) +# target 157375370 value 160850840 + comp |> - filter(!varname %in% badmatches, count==1) |> + filter(!varname %in% badmatches) |> arrange(desc(abs(pdiff))) comp |> @@ -240,5 +278,6 @@ verybad |> # should we create a shared-down variable? # - e00300 taxable interest seems a little off + ``` From 5b386ae4d72500ab5c9a87895ef26266f130dbb8 Mon Sep 17 00:00:00 2001 From: donboyd5 Date: Mon, 18 Nov 2024 17:02:18 -0500 Subject: [PATCH 33/33] targets file with social security and salt --- tmd/areas/targets/ny21_targets.csv | 36 ------------------------------ 1 file changed, 36 deletions(-) diff --git a/tmd/areas/targets/ny21_targets.csv b/tmd/areas/targets/ny21_targets.csv index 85add114..350eb532 100644 --- a/tmd/areas/targets/ny21_targets.csv +++ b/tmd/areas/targets/ny21_targets.csv @@ -1,14 +1,5 @@ varname,count,scope,agilo,agihi,fstatus,target XTOT,0,0,-9e99,9e99,0,778806.9963618957 -c00100,1,1,-9e99,1,0,7783.30061591544 -c00100,1,1,1,10000,0,34845.00467650245 -c00100,1,1,10000,25000,0,58872.665215289446 -c00100,1,1,25000,50000,0,92073.22923991714 -c00100,1,1,50000,75000,0,57122.387347592645 -c00100,1,1,75000,100000,0,36860.296870916754 -c00100,1,1,100000,200000,0,55175.53757452468 -c00100,1,1,200000,500000,0,12676.61287259154 -c00100,1,1,500000,9e99,0,2380.3685403844206 c00100,0,1,1,10000,0,182423342.11336407 c00100,0,1,10000,25000,0,1037335916.1426524 c00100,0,1,25000,50000,0,3369386206.2984276 @@ -17,33 +8,6 @@ c00100,0,1,75000,100000,0,3192887737.1752 c00100,0,1,100000,200000,0,7444391217.009996 c00100,0,1,200000,500000,0,3533125242.646242 c00100,0,1,500000,9e99,0,3512409391.634545 -c00100,1,1,-9e99,1,1,5025.5683979292535 -c00100,1,1,1,10000,1,28500.37375066762 -c00100,1,1,10000,25000,1,40999.78237965522 -c00100,1,1,25000,50000,1,55431.39047078707 -c00100,1,1,50000,75000,1,27828.250423235033 -c00100,1,1,75000,100000,1,11313.918480070117 -c00100,1,1,100000,200000,1,8421.1356665729 -c00100,1,1,200000,500000,1,1666.754799870258 -c00100,1,1,500000,9e99,1,375.3116805477707 -c00100,1,1,-9e99,1,2,1474.965149593628 -c00100,1,1,1,10000,2,3227.4938250458044 -c00100,1,1,10000,25000,2,7784.831857302671 -c00100,1,1,25000,50000,2,17620.627308577044 -c00100,1,1,50000,75000,2,20112.27469996074 -c00100,1,1,75000,100000,2,21629.717785468103 -c00100,1,1,100000,200000,2,43900.36435388441 -c00100,1,1,200000,500000,2,10580.306783344742 -c00100,1,1,500000,9e99,2,1903.525763605093 -c00100,1,1,-9e99,1,4,1041.8053348183391 -c00100,1,1,1,10000,4,2545.6814968580734 -c00100,1,1,10000,25000,4,8850.43355733606 -c00100,1,1,25000,50000,4,16010.696051542369 
-c00100,1,1,50000,75000,4,6907.72695791437 -c00100,1,1,75000,100000,4,2875.8964103872336 -c00100,1,1,100000,200000,4,1963.7019740835344 -c00100,1,1,200000,500000,4,297.62913704836967 -c00100,1,1,500000,9e99,4,61.76571232759558 e00200,0,1,-9e99,1,0,28535896.06329254 e00200,0,1,1,10000,0,132417077.07366145 e00200,0,1,10000,25000,0,620330137.749753
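
Finally, the target-file layout documented earlier makes finished files such as ny21_targets.csv easy to sanity-check. A minimal sketch (the path assumes the repository root as the working directory):

```r
# Minimal sketch: verify the special first data row of a targets file
# and summarize the remaining targets by variable.
library(dplyr)
library(readr)

targets <- read_csv("tmd/areas/targets/ny21_targets.csv", show_col_types = FALSE)

first <- slice(targets, 1)
stopifnot(first$varname == "XTOT", first$count == 0, first$scope == 0,
          first$fstatus == 0, first$agilo < -8e99, first$agihi > 8e99)

targets |>
  slice(-1) |>
  summarise(n = n(), total = sum(target), .by = c(varname, count))
```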