diff --git a/healthcare_data/.gitignore b/data_processing/.gitignore similarity index 100% rename from healthcare_data/.gitignore rename to data_processing/.gitignore diff --git a/healthcare_data/README.md b/data_processing/README.md similarity index 100% rename from healthcare_data/README.md rename to data_processing/README.md diff --git a/healthcare_data/analyses/.gitkeep b/data_processing/analyses/.gitkeep similarity index 100% rename from healthcare_data/analyses/.gitkeep rename to data_processing/analyses/.gitkeep diff --git a/healthcare_data/dbt_project.yml b/data_processing/dbt_project.yml similarity index 100% rename from healthcare_data/dbt_project.yml rename to data_processing/dbt_project.yml diff --git a/healthcare_data/macros/.gitkeep b/data_processing/macros/.gitkeep similarity index 100% rename from healthcare_data/macros/.gitkeep rename to data_processing/macros/.gitkeep diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_inpatient_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_inpatient_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_commercial_inpatient_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_inpatient_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_outpatient_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_outpatient_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_commercial_outpatient_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_outpatient_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_person_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_person_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_commercial_person_2016.sql rename to 
data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_person_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_pharmacy_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_pharmacy_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_commercial_pharmacy_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_commercial_pharmacy_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_inpatient_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_inpatient_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicaid_inpatient_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_inpatient_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_outpatient_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_outpatient_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicaid_outpatient_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_outpatient_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_person_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_person_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicaid_person_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_person_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_pharmacy_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_pharmacy_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicaid_pharmacy_2016.sql rename to 
data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_pharmacy_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_provider_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_provider_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicaid_provider_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicaid_provider_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_inpatient_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_inpatient_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicare_inpatient_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_inpatient_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_outpatient_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_outpatient_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicare_outpatient_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_outpatient_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_person_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_person_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicare_person_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_person_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_pharmacy_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_pharmacy_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicare_pharmacy_2016.sql rename to 
data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_pharmacy_2016.sql diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_provider_2016.sql b/data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_provider_2016.sql similarity index 100% rename from healthcare_data/models/generated/with_types/syhdr_medicare_provider_2016.sql rename to data_processing/models/ahrq.gov/generated/with_types/syhdr_medicare_provider_2016.sql diff --git a/data_processing/models/bls.gov/consumer_price_index.sql b/data_processing/models/bls.gov/consumer_price_index.sql new file mode 100644 index 0000000..810ecc3 --- /dev/null +++ b/data_processing/models/bls.gov/consumer_price_index.sql @@ -0,0 +1,6 @@ +{{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} + +SELECT + YEAR AS year, + "AVG" AS consumer_price_index +FROM {{ ref('download_consumer_price_index') }} \ No newline at end of file diff --git a/data_processing/models/bls.gov/download_consumer_price_index.py b/data_processing/models/bls.gov/download_consumer_price_index.py new file mode 100644 index 0000000..da96dee --- /dev/null +++ b/data_processing/models/bls.gov/download_consumer_price_index.py @@ -0,0 +1,13 @@ +import pandas as pd +import os +import requests + +def model(dbt, session): + # Local path to the Excel file containing the Consumer Price Index data + output_path = dbt.config.get('output_path') + base_path = os.path.expanduser(output_path) + excel_path = os.path.join(base_path, "r-cpi-u-rs-allitems.xlsx") + # cpi_url = "https://www.bls.gov/cpi/research-series/r-cpi-u-rs-allitems.xlsx" + # TODO: download cpi_url and save it under output_path as r-cpi-u-rs-allitems.xlsx; until then the file must already exist there + consumer_price_index_df = pd.read_excel(excel_path, skiprows=5, usecols=['YEAR', 'AVG']) + return consumer_price_index_df \ No newline at end of file diff --git a/data_processing/models/config.yml b/data_processing/models/config.yml new file mode 100644 index 0000000..c87cca9 --- /dev/null
+++ b/data_processing/models/config.yml @@ -0,0 +1,11 @@ +version: 2 + +models: +- name: download_consumer_price_index + config: + data_path: "{{ var('data_path') }}" + output_path: "{{ var('output_path') }}" +- name: ahrq.gov + config: + data_path: "{{ var('data_path') }}" + output_path: "{{ var('output_path') }}" diff --git a/healthcare_data/models/figures/insurance_plan_payment_histogram.sql b/data_processing/models/figures/insurance_plan_payment_histogram.sql similarity index 100% rename from healthcare_data/models/figures/insurance_plan_payment_histogram.sql rename to data_processing/models/figures/insurance_plan_payment_histogram.sql diff --git a/data_processing/models/figures/insurance_plan_payment_histogram_inflation_adjusted.sql b/data_processing/models/figures/insurance_plan_payment_histogram_inflation_adjusted.sql new file mode 100644 index 0000000..b39e1c3 --- /dev/null +++ b/data_processing/models/figures/insurance_plan_payment_histogram_inflation_adjusted.sql @@ -0,0 +1,62 @@ +{{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} + +WITH cpi_adjustment AS ( + SELECT + year, + consumer_price_index + FROM {{ ref('consumer_price_index') }} +), +latest_cpi AS ( + SELECT + MAX(consumer_price_index) AS cpi_2022 + FROM cpi_adjustment + WHERE year = 2022 +), +inflation_adjustment_factors AS ( + SELECT + 2016 AS year, + (lc.cpi_2022 / ca.consumer_price_index) AS adjustment_factor_to_2022 + FROM cpi_adjustment ca + CROSS JOIN latest_cpi lc + WHERE ca.year = 2016 +), +commercial_data AS ( + SELECT + PLAN_PMT_AMT * iaf.adjustment_factor_to_2022 AS Payment, + COUNT(*) AS count, + 'Commercial' AS Insurance + FROM read_parquet('/Users/me/data/syh_dr/syhdr_commercial_inpatient_2016.parquet') cd + JOIN inflation_adjustment_factors iaf ON 1 = 1 + GROUP BY PLAN_PMT_AMT, iaf.adjustment_factor_to_2022 +), +medicaid_data AS ( + SELECT + PLAN_PMT_AMT * iaf.adjustment_factor_to_2022 AS Payment, + COUNT(*) AS count, + 'Medicaid' AS 
Insurance + FROM read_parquet('/Users/me/data/syh_dr/syhdr_medicaid_inpatient_2016.parquet') md + JOIN inflation_adjustment_factors iaf ON 1 = 1 + GROUP BY PLAN_PMT_AMT, iaf.adjustment_factor_to_2022 +), +medicare_data AS ( + SELECT + PLAN_PMT_AMT * iaf.adjustment_factor_to_2022 AS Payment, + COUNT(*) AS count, + 'Medicare' AS Insurance + FROM read_parquet('/Users/me/data/syh_dr/syhdr_medicare_inpatient_2016.parquet') mcd + JOIN inflation_adjustment_factors iaf ON 1 = 1 + GROUP BY PLAN_PMT_AMT, iaf.adjustment_factor_to_2022 +), +combined_data AS ( + SELECT * FROM commercial_data + UNION ALL + SELECT * FROM medicaid_data + UNION ALL + SELECT * FROM medicare_data +) +SELECT + Payment, + count, + Insurance +FROM combined_data +ORDER BY Insurance, Payment \ No newline at end of file diff --git a/healthcare_data/scripts/generate_syh_dr_data_models.py b/data_processing/scripts/generate_syh_dr_data_models.py similarity index 100% rename from healthcare_data/scripts/generate_syh_dr_data_models.py rename to data_processing/scripts/generate_syh_dr_data_models.py diff --git a/healthcare_data/seeds/.gitkeep b/data_processing/seeds/.gitkeep similarity index 100% rename from healthcare_data/seeds/.gitkeep rename to data_processing/seeds/.gitkeep diff --git a/healthcare_data/snapshots/.gitkeep b/data_processing/snapshots/.gitkeep similarity index 100% rename from healthcare_data/snapshots/.gitkeep rename to data_processing/snapshots/.gitkeep diff --git a/healthcare_data/tests/.gitkeep b/data_processing/tests/.gitkeep similarity index 100% rename from healthcare_data/tests/.gitkeep rename to data_processing/tests/.gitkeep diff --git a/docs/data/insurance_plan_payment_histogram.parquet b/docs/data/insurance_plan_payment_histogram.parquet index 3c4de53..5f168df 100644 Binary files a/docs/data/insurance_plan_payment_histogram.parquet and b/docs/data/insurance_plan_payment_histogram.parquet differ diff --git a/docs/index.md b/docs/index.md index 714edf8..ecdc39f 100644 --- 
a/docs/index.md +++ b/docs/index.md @@ -79,7 +79,7 @@ function paymentChart(paymentData, width) { return Plot.plot({ width, marginLeft: 60, - x: { type: "log", domain: [1, 1000000] }, // Set the domain of the x-axis to be fixed between 1 and 1,000,000 + x: { type: "log", domain: [100, 300_000] }, // Set the domain of the x-axis to be fixed between 100 and 300,000 y: { axis: null }, // Hide the y-axis color: { legend: "swatches", columns: 1, domain: orderInsurance }, marks: [ @@ -93,9 +93,9 @@ function paymentChart(paymentData, width) { fill: "Insurance", order: orderInsurance, thresholds: d3 - .ticks(Math.log10(1), Math.log10(1000000), 40) - .map((d) => +(10 ** d).toPrecision(3)), - tip: true, + .ticks(Math.log10(1), Math.log10(1000000), 90) + .map((d) => 10 ** d), + tip: { format: { x: ",.3r" } } } ) ), diff --git a/healthcare_data/models/config.yml b/healthcare_data/models/config.yml deleted file mode 100644 index 6cb1694..0000000 --- a/healthcare_data/models/config.yml +++ /dev/null @@ -1,7 +0,0 @@ -version: 2 - -models: -- name: healthcare_data - config: - public_use_microdata_sample_url: "{{ var('data_path') }}" - output_path: "{{ var('output_path') }}" diff --git a/requirements.in b/requirements.in index eb10336..4bcd445 100644 --- a/requirements.in +++ b/requirements.in @@ -11,4 +11,6 @@ ipykernel jupysql pip-tools duckdb-engine -pdfplumber \ No newline at end of file +pdfplumber +openpyxl +pip-tools \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 79a5234..2f43871 100644 --- a/requirements.txt +++ b/requirements.txt @@ -80,6 +80,8 @@ duckdb==0.10.1 # duckdb-engine duckdb-engine==0.11.2 # via -r requirements.in +et-xmlfile==1.1.0 + # via openpyxl executing==2.0.1 # via stack-data fonttools==4.51.0 @@ -165,6 +167,8 @@ numpy==1.26.4 # pandas # pyarrow # seaborn +openpyxl==3.1.2 + # via -r requirements.in packaging==24.0 # via # build