-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #348 from PSLmodels/precision
Add tmd/examination/2022/precision.sh script and associated files
- Loading branch information
Showing
3 changed files
with
113 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
""" | ||
Generate sample precision estimates using bootstrap sampling methods. | ||
""" | ||
|
||
import sys | ||
import numpy as np | ||
import pandas as pd | ||
|
||
USAGE = "USAGE: python bootstrap_sampling.py tc_dump_output_file_name\n" | ||
|
||
BS_SAMPLES = 1000 | ||
BS_RNSEED = 192837465 | ||
BS_DUMP = False | ||
BS_CI = True | ||
|
||
|
||
def bootstrap_sampling(outfile): | ||
""" | ||
High-level logic of the script. | ||
""" | ||
# read odf from outfile | ||
odf = pd.read_csv(outfile) | ||
print("len(odf) =", len(odf)) | ||
fdf = odf[(odf["data_source"] == 1) & (odf["iitax"] > 0)] | ||
print("len(fdf) =", len(fdf)) | ||
print(f"FILE wght (#M) = {fdf['s006'].sum() * 1e-6:.3f}") | ||
print(f"FILE itax ($B) = {(fdf['s006'] * fdf['iitax']).sum() * 1e-9:.3f}") | ||
|
||
# compute sum of wght and wght*itax for each bootstrap sample | ||
xdf = pd.DataFrame({"wght": fdf["s006"], "itax": fdf["iitax"]}) | ||
rng = np.random.default_rng(seed=BS_RNSEED) | ||
wght_list = [] | ||
itax_list = [] | ||
for bss in range(1, BS_SAMPLES + 1): | ||
bsdf = xdf.sample(n=len(xdf), replace=True, random_state=rng) | ||
wght_value = bsdf["wght"].sum() * 1e-6 | ||
itax_value = (bsdf["wght"] * bsdf["itax"]).sum() * 1e-9 | ||
if BS_DUMP: | ||
print(f"{itax_value:9.3f} {wght_value:7.3f} {bss:4d}") | ||
wght_list.append(wght_value) | ||
itax_list.append(itax_value) | ||
wght = np.sort(np.array(wght_list)) | ||
itax = np.sort(np.array(itax_list)) | ||
print( | ||
f"BS:wght num,mean,stdev = {BS_SAMPLES:4d} " | ||
f"{wght.mean():9.3f} {wght.std():7.3f}" | ||
) | ||
if BS_CI and BS_SAMPLES == 1000: | ||
print(f"BS:wght median = {wght[499]:9.3f}") | ||
print(f"BS:wght 95%_conf_intv = {wght[24]:9.3f} , {wght[974]:9.3f}") | ||
print( | ||
f"BS:itax num,mean,stdev = {BS_SAMPLES:4d} " | ||
f"{itax.mean():9.3f} {itax.std():7.3f}" | ||
) | ||
if BS_CI and BS_SAMPLES == 1000: | ||
print(f"BS:itax median = {itax[499]:9.3f}") | ||
print(f"BS:itax 95%_conf_intv = {itax[24]:9.3f} , {itax[974]:9.3f}") | ||
|
||
return 0 | ||
|
||
|
||
if __name__ == "__main__": | ||
if len(sys.argv) - 1 != 1: | ||
sys.stderr.write("ERROR: one command-line argument not specified\n") | ||
sys.stderr.write(USAGE) | ||
sys.exit(1) | ||
sys.exit(bootstrap_sampling(sys.argv[1])) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# The examination/2022/precision.sh script using bootstapping methods to | ||
# estimate the sampling precision of the weighted positive iitax estimate. | ||
# PREREQUISITE: | ||
# (taxcalc-dev) tax-microdata-benchmarking% make test | ||
# USAGE: | ||
# (taxcalc-dev) 2022% ./precision.sh | ||
|
||
# === SETUP === | ||
TMD=../.. | ||
cp $TMD/storage/output/tmd*csv* . | ||
gunzip -f tmd.csv.gz | ||
STATES="ak mn nj nm sc va" | ||
|
||
# === WEIGHTS === | ||
for S in $STATES; do | ||
echo Generating weights for $S ... | ||
unzip -oq phase6-state-targets.zip ${S}_targets.csv | ||
mv -f ${S}_targets.csv $TMD/areas/targets | ||
pushd $TMD/areas > /dev/null | ||
rm -f weights/${S}_tmd_weights.csv.gz | ||
python create_area_weights.py $S > ${S}_local.log | ||
mv -f ${S}_local.log ../examination/2022 | ||
mv -f weights/${S}_tmd_weights.csv.gz ../examination/2022 | ||
popd > /dev/null | ||
done | ||
|
||
# === RESULTS === | ||
cd $TMD/examination/2022 | ||
TC_OPTIONS="--exact --dump --dvars precvars" | ||
echo Generating results for US ... | ||
tc tmd.csv 2022 $TC_OPTIONS | grep -v data | ||
python bootstrap_sampling.py tmd-22-#-#-#.csv | ||
for S in $STATES; do | ||
echo Generating results for $S ... | ||
TMD_AREA=$S tc tmd.csv 2022 $TC_OPTIONS | grep -v data | ||
python bootstrap_sampling.py tmd_${S}-22-#-#-#.csv | ||
done | ||
|
||
# === CLEANUP === | ||
rm -f ./*tmd*csv* | ||
rm -f ./tmd*-22-* | ||
rm -f ./*.log | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
iitax | ||
s006 | ||
data_source |