diff --git a/tmd/areas/chisquare_test.py b/tmd/areas/chisquare_test.py new file mode 100644 index 00000000..06a8b118 --- /dev/null +++ b/tmd/areas/chisquare_test.py @@ -0,0 +1,228 @@ +""" +Conduct chi-square test of the similarity of two distributions of weights by +income tax liability categories for the same area that are generated using +different targets. The two area weights files are assumed to be located in +the tmd/areas/weights folder. +All test output is written to stdout; error messages are written to stderr. + +For background information on the chi-square test that compares two +categorical (that is, binned) data sets, see Section 14.3: Are Two +Distributions Different? in Press, et al., Numerical Recipies in C: +The Art of Scientific Computing, Second Edition (Cambridge University +Press, 1992). This script uses the chi2_contingency function in the +Python scipy package, which is documented at the following URL: +https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html + +Critical chi-square statistic values (say, at p=0.05) for large +degrees-of-freedom values (which equals numbins-1 in this test) +are available from the following online calculator: +https://www.danielsoper.com/statcalc/calculator.aspx?id=12 +For example: Xsq(p=0.05,dof= 100)= 124.34 + Xsq(p=0.05,dof= 200)= 233.99 + Xsq(p=0.05,dof=1000)= 1074.68 + Xsq(p=0.05,dof=2000)= 2105.15 + +USAGE: python chisquare_test.py WGHT1 WGHT2 [numbins] [dump] + +EXAMPLE using default numbins (equal to 100) and with no details dump: +areas% python chisquare_test.py pa08 pa08A +""" + +import os +import sys +from pathlib import Path +import numpy as np +import pandas as pd +from scipy.stats import chi2_contingency +from tmd.areas import AREAS_FOLDER +from tmd.storage import STORAGE_FOLDER + +USAGE = "USAGE: python chisquare_test.py WGHT1 WGHT2 [numbins] [dump]" +CACHED_ITAX_PATH = STORAGE_FOLDER / "output" / "cached_iitax.npy" +TAX_YEAR = 2021 +MINIMUM_NUM_BINS = 50 +DEFAULT_NUM_BINS = 200 +ITXBINS_DEFINED_USING_AREA_WEIGHTS = False # default is using national weights + + +def check_arguments(): + """ + Check validity of arguments and return the following tuple: + (wname1, wpath1, wname2, wpath2, numbins, dump) + containing name of and path to the WGHT1 and WGHT2 weights files, + plus requested number of income tax categories (or bins) and + whether or not detailed dump information is included in output. + """ + numargs = len(sys.argv) - 1 + if not 2 <= numargs <= 4: + sys.stderr.write( + f"ERROR: number of arguments not in [2,4] range\n{USAGE}\n" + ) + sys.exit(1) + all_ok = True + wname1 = sys.argv[1] + wpath1 = AREAS_FOLDER / "weights" / f"{wname1}_tmd_weights.csv.gz" + if not wpath1.exists(): + sys.stderr.write(f"ERROR: WGHT1 {str(wpath1)} file does not exist\n") + all_ok = False + wname2 = sys.argv[2] + wpath2 = AREAS_FOLDER / "weights" / f"{wname2}_tmd_weights.csv.gz" + if not wpath2.exists(): + sys.stderr.write(f"ERROR: WGHT2 {str(wpath2)} file does not exist\n") + all_ok = False + if not CACHED_ITAX_PATH.exists(): + sys.stderr.write( + f"ERROR: {str(CACHED_ITAX_PATH)} file does not exist\n" + ) + all_ok = False + numbins = DEFAULT_NUM_BINS + if numargs >= 3: + numbins = int(sys.argv[3]) + if numbins < MINIMUM_NUM_BINS: + sys.stderr.write( + f"ERROR: numbins must be no less than {MINIMUM_NUM_BINS}\n" + ) + all_ok = False + dump = False + if numargs >= 4: + if sys.argv[4] == "dump": + dump = True + else: + sys.stderr.write("ERROR: optional fourth argument must be dump\n") + all_ok = False + if not all_ok: + sys.stderr.write(f"{USAGE}\n") + sys.exit(1) + return (wname1, wpath1, wname2, wpath2, numbins, dump) + + +def weights_array(wghts_path: Path): + """ + Return array containing TAX_YEAR weights in area weights file with wpath. + """ + wdf = pd.read_csv(wghts_path) + warray = wdf[f"WT{TAX_YEAR}"] + return warray + + +def weighted_qcut_variable(values, weights, numbins): + """ + Return weighted quantile bin variable for specified values. + """ + quantiles = np.linspace(0, 1, numbins + 1) + order = weights.iloc[values.argsort()].cumsum() + bins = pd.cut(order / order.iloc[-1], quantiles, right=True, labels=False) + return bins.sort_index() + + +def sorted_vdf_with_itxbin(vdf: pd.DataFrame, numbins: int): + """ + Expects vdf to include two area weights arrays and an itax array. + Returns itax-sorted vdf with added itxbin given NUM_BINS environ variable. + """ + # check contents of vdf + assert "wght1" in vdf, "wght1 variable not in vdf" + assert "wght2" in vdf, "wght2 variable not in vdf" + assert "itax" in vdf, "itax variable not in vdf" + # add itxbin variable to vdf + if ITXBINS_DEFINED_USING_AREA_WEIGHTS: + wght = 0.5 * (vdf.wght1 + vdf.wght2) + else: # using national weights + wght = weights_array(STORAGE_FOLDER / "output" / "tmd_weights.csv.gz") + vdf["itxbin"] = weighted_qcut_variable(vdf.itax, wght, numbins) + # return sorted vdf + vdf.sort_values("itax", inplace=True) + return vdf + + +# -- High-level logic of the script: + + +def main( + wname1: str, + wpath1: Path, + wname2: str, + wpath2: Path, + numbins: int, + dump: bool, +): + """ + Conduct chi-square two-variable test using the WGHT1 weights (wpath1) + and the WGHT2 weights (wpath2), which are weights for the same area + generated using different targets, using specified income tax numbins. + """ + # read cached iitax amounts, which are the same for each area weights file + itax = np.load(CACHED_ITAX_PATH) + + # construct vdf DataFrame and sort it and add itxbin variable to it + wght1 = weights_array(wpath1) + wght2 = weights_array(wpath2) + assert len(wght1) == len(wght2) == len(itax) + vdf = pd.DataFrame({"wght1": wght1, "wght2": wght2, "itax": itax}) + vdf = sorted_vdf_with_itxbin(vdf, numbins) + if dump: + print("***** vdf=\n", vdf) + pct = np.linspace(0.1, 0.9, num=9) + print("***** vdf.describe=\n", vdf.describe(percentiles=pct)) + + # prepare frequency data for chi-square test + unweighted_count = len(vdf) + wght1_sum = wght1.sum() + wght2_sum = wght2.sum() + if dump: + print(f"***** {wname1:>5}_weight_total= {wght1_sum:.5f}") + print(f"***** {wname2:>5}_weight_total= {wght2_sum:.5f}") + ratio = wght2_sum / wght1_sum + print(f"***** ==> ratio= {ratio:.8f}") + gvdf = vdf.groupby(by=["itxbin"], sort=False, observed=True) + if dump: + itxtop = gvdf["itax"].max() + print("***** itax_at_bin_top=\n", itxtop) + freq1 = (unweighted_count / wght1_sum) * gvdf["wght1"].sum() + freq2 = (unweighted_count / wght2_sum) * gvdf["wght2"].sum() + if dump: + fdf = pd.DataFrame({"freq1": freq1, "freq2": freq2}) + pct = np.linspace(0.1, 0.9, num=9) + print("***** freq.describe=\n", fdf.describe(percentiles=pct)) + assert len(freq1) == len(freq2) + assert np.allclose([freq1.sum()], [freq2.sum()]) + witax1 = (wght1 * itax).sum() * 1e-9 + witax2 = (wght2 * itax).sum() * 1e-9 + witax_ratio = witax2 / witax1 + print( + f"{wname1},{wname2}_weighted_iitax($B)= {witax1:.3f} {witax2:.3f}" + f" ===> ratio= {witax_ratio:.4f}" + ) + min_bin_cnt = min(np.min(freq1), np.min(freq2)) + print(f"numbins,minimum_bin_freqency_count= {numbins} {min_bin_cnt:.1f}") + if min_bin_cnt < 5: + if numbins > MINIMUM_NUM_BINS: + print("WARNING: reduce value of numbins command-line argument") + print(" to get minimum_bin_fredquency_count above five") + else: + print("WARNING: minimum_bin_fredquency_count is below five") + print(" so use TMD_AREA_ITXBINS=1 environment variable") + print(USAGE) + return 1 + + # conduct chi-square test + print("Conducting chi-square two-variable independence test:") + ctable = pd.DataFrame({"1": freq1, "2": freq2}) + res = chi2_contingency(ctable) + print(f"Xsq_statistic= {res.statistic:.3f}") + print(f"Xsq_test_pval= {res.pvalue:.3f} where dof= {res.dof}") + if dump: + print("***** Xsq(pval=0.05,dof= 100)= 124.34") + print("***** Xsq(pval=0.05,dof= 200)= 233.99") + print("***** Xsq(pval=0.05,dof=1000)= 1074.68") + print("***** Xsq(pval=0.05,dof=2000)= 2105.15") + + return 0 + + +if __name__ == "__main__": + if "TMD_AREA_ITXBINS" in os.environ: + ITXBINS_DEFINED_USING_AREA_WEIGHTS = True + awname1, awpath1, awname2, awpath2, num_bins, dump_ = check_arguments() + RCODE = main(awname1, awpath1, awname2, awpath2, num_bins, dump_) + sys.exit(RCODE) diff --git a/tmd/areas/create_area_weights.py b/tmd/areas/create_area_weights.py index 57d89cab..76baa325 100644 --- a/tmd/areas/create_area_weights.py +++ b/tmd/areas/create_area_weights.py @@ -5,7 +5,8 @@ AREA prefix for state areas are the two lower-case character postal codes. AREA prefix for congressional districts are the state prefix followed by two digits (with a leading zero) identifying the district. States with -only one congressional district have 00 as the two digits. +only one congressional district have 00 as the two digits to align names +with IRS data. """ import sys @@ -47,6 +48,11 @@ OPTIMIZE_IPRINT = 0 # 20 is a good diagnostic value; set to 0 for production OPTIMIZE_RESULTS = False # set to True to see complete optimization results +# assumption about which Congress the congressional districts are defined for: +# ... 2010 implies districts for the 117th Congress +# ... 2020 implies districts for the 118th Congress +CD_CENSUS_YEAR = 2010 + def valid_area(area: str): """