diff --git a/tmd/examination/2022/bootstrap_sampling.py b/tmd/examination/2022/bootstrap_sampling.py new file mode 100644 index 00000000..db8ac4c1 --- /dev/null +++ b/tmd/examination/2022/bootstrap_sampling.py @@ -0,0 +1,67 @@ +""" +Generate sample precision estimates using bootstrap sampling methods. +""" + +import sys +import numpy as np +import pandas as pd + +USAGE = "USAGE: python bootstrap_sampling.py tc_dump_output_file_name\n" + +BS_SAMPLES = 1000 +BS_RNSEED = 192837465 +BS_DUMP = False +BS_CI = True + + +def bootstrap_sampling(outfile): + """ + High-level logic of the script. + """ + # read odf from outfile + odf = pd.read_csv(outfile) + print("len(odf) =", len(odf)) + fdf = odf[(odf["data_source"] == 1) & (odf["iitax"] > 0)] + print("len(fdf) =", len(fdf)) + print(f"FILE wght (#M) = {fdf['s006'].sum() * 1e-6:.3f}") + print(f"FILE itax ($B) = {(fdf['s006'] * fdf['iitax']).sum() * 1e-9:.3f}") + + # compute sum of wght and wght*itax for each bootstrap sample + xdf = pd.DataFrame({"wght": fdf["s006"], "itax": fdf["iitax"]}) + rng = np.random.default_rng(seed=BS_RNSEED) + wght_list = [] + itax_list = [] + for bss in range(1, BS_SAMPLES + 1): + bsdf = xdf.sample(n=len(xdf), replace=True, random_state=rng) + wght_value = bsdf["wght"].sum() * 1e-6 + itax_value = (bsdf["wght"] * bsdf["itax"]).sum() * 1e-9 + if BS_DUMP: + print(f"{itax_value:9.3f} {wght_value:7.3f} {bss:4d}") + wght_list.append(wght_value) + itax_list.append(itax_value) + wght = np.sort(np.array(wght_list)) + itax = np.sort(np.array(itax_list)) + print( + f"BS:wght num,mean,stdev = {BS_SAMPLES:4d} " + f"{wght.mean():9.3f} {wght.std():7.3f}" + ) + if BS_CI and BS_SAMPLES == 1000: + print(f"BS:wght median = {wght[499]:9.3f}") + print(f"BS:wght 95%_conf_intv = {wght[24]:9.3f} , {wght[974]:9.3f}") + print( + f"BS:itax num,mean,stdev = {BS_SAMPLES:4d} " + f"{itax.mean():9.3f} {itax.std():7.3f}" + ) + if BS_CI and BS_SAMPLES == 1000: + print(f"BS:itax median = {itax[499]:9.3f}") + print(f"BS:itax 95%_conf_intv = {itax[24]:9.3f} , {itax[974]:9.3f}") + + return 0 + + +if __name__ == "__main__": + if len(sys.argv) - 1 != 1: + sys.stderr.write("ERROR: one command-line argument not specified\n") + sys.stderr.write(USAGE) + sys.exit(1) + sys.exit(bootstrap_sampling(sys.argv[1])) diff --git a/tmd/examination/2022/precision.sh b/tmd/examination/2022/precision.sh new file mode 100755 index 00000000..e0e1da1f --- /dev/null +++ b/tmd/examination/2022/precision.sh @@ -0,0 +1,43 @@ +# The examination/2022/precision.sh script using bootstapping methods to +# estimate the sampling precision of the weighted positive iitax estimate. +# PREREQUISITE: +# (taxcalc-dev) tax-microdata-benchmarking% make test +# USAGE: +# (taxcalc-dev) 2022% ./precision.sh + +# === SETUP === +TMD=../.. +cp $TMD/storage/output/tmd*csv* . +gunzip -f tmd.csv.gz +STATES="ak mn nj nm sc va" + +# === WEIGHTS === +for S in $STATES; do + echo Generating weights for $S ... + unzip -oq phase6-state-targets.zip ${S}_targets.csv + mv -f ${S}_targets.csv $TMD/areas/targets + pushd $TMD/areas > /dev/null + rm -f weights/${S}_tmd_weights.csv.gz + python create_area_weights.py $S > ${S}_local.log + mv -f ${S}_local.log ../examination/2022 + mv -f weights/${S}_tmd_weights.csv.gz ../examination/2022 + popd > /dev/null +done + +# === RESULTS === +cd $TMD/examination/2022 +TC_OPTIONS="--exact --dump --dvars precvars" +echo Generating results for US ... +tc tmd.csv 2022 $TC_OPTIONS | grep -v data +python bootstrap_sampling.py tmd-22-#-#-#.csv +for S in $STATES; do + echo Generating results for $S ... + TMD_AREA=$S tc tmd.csv 2022 $TC_OPTIONS | grep -v data + python bootstrap_sampling.py tmd_${S}-22-#-#-#.csv +done + +# === CLEANUP === +rm -f ./*tmd*csv* +rm -f ./tmd*-22-* +rm -f ./*.log +exit 0 diff --git a/tmd/examination/2022/precvars b/tmd/examination/2022/precvars new file mode 100644 index 00000000..03bcb950 --- /dev/null +++ b/tmd/examination/2022/precvars @@ -0,0 +1,3 @@ +iitax +s006 +data_source