From 6dc81b75277f349d112bccc0a8db61d9b2240c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaan=20L=C4=B1?= Date: Sat, 6 Apr 2024 19:55:53 -0400 Subject: [PATCH] wip: debug syh dr data model --- README.md | 16 +++++-- .../insurance_plan_payment_histogram.sql | 39 +++++++++++++++ .../syhdr_commercial_inpatient_2016.sql | 18 +++---- .../syhdr_commercial_outpatient_2016.sql | 18 +++---- .../syhdr_commercial_person_2016.sql | 16 +++---- .../syhdr_commercial_pharmacy_2016.sql | 14 +++--- .../syhdr_medicaid_inpatient_2016.sql | 18 +++---- .../syhdr_medicaid_outpatient_2016.sql | 18 +++---- .../with_types/syhdr_medicaid_person_2016.sql | 24 +++++----- .../syhdr_medicaid_pharmacy_2016.sql | 14 +++--- .../syhdr_medicare_inpatient_2016.sql | 18 +++---- .../syhdr_medicare_outpatient_2016.sql | 18 +++---- .../with_types/syhdr_medicare_person_2016.sql | 20 ++++---- .../syhdr_medicare_pharmacy_2016.sql | 14 +++--- .../scripts/generate_syh_dr_data_models.py | 48 +++++++++++-------- 15 files changed, 185 insertions(+), 128 deletions(-) create mode 100644 healthcare_data/models/figures/insurance_plan_payment_histogram.sql diff --git a/README.md b/README.md index ac41978..ba82bf3 100644 --- a/README.md +++ b/README.md @@ -100,10 +100,10 @@ cd healthcare_data python scripts/generate_syh_dr_data_models.py ~/data/syh_dr https://www.ahrq.gov/sites/default/files/wysiwyg/data/SyH-DR-Codebook.pdf ``` -2. Generate the synthetic healthcare data (takes ~5 minutes): +2. Generate the synthetic healthcare data (takes ~2.5 minutes, with 8 threads on a Macbook): ```bash -dbt run +dbt run --threads 8 ``` 3. Verify that you can query the data on the command line: @@ -111,10 +111,18 @@ dbt run ```bash -## To build a specific data model: +## To build a specific data model Use `--select` in dbt: ```bash -syhdr_medicare_outpatient_2016 +dbt run --select "syhdr_medicare_outpatient_2016" +``` + +## To build a specific figure for visualization with Observable Framework + +Use `--select` in dbt to select models, e.g. in order to build all histograms: + +```bash +dbt run --select "*histogram*" ``` \ No newline at end of file diff --git a/healthcare_data/models/figures/insurance_plan_payment_histogram.sql b/healthcare_data/models/figures/insurance_plan_payment_histogram.sql new file mode 100644 index 0000000..e649847 --- /dev/null +++ b/healthcare_data/models/figures/insurance_plan_payment_histogram.sql @@ -0,0 +1,39 @@ +{{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} + +WITH commercial_data AS ( + SELECT + PLAN_PMT_AMT AS Payment, + COUNT(*) AS count, + 'Commercial' AS Insurance + FROM read_parquet('/Users/me/data/syh_dr/syhdr_commercial_inpatient_2016.parquet') + GROUP BY PLAN_PMT_AMT +), +medicaid_data AS ( + SELECT + PLAN_PMT_AMT AS Payment, + COUNT(*) AS count, + 'Medicaid' AS Insurance + FROM read_parquet('/Users/me/data/syh_dr/syhdr_medicaid_inpatient_2016.parquet') + GROUP BY PLAN_PMT_AMT +), +medicare_data AS ( + SELECT + PLAN_PMT_AMT AS Payment, + COUNT(*) AS count, + 'Medicare' AS Insurance + FROM read_parquet('/Users/me/data/syh_dr/syhdr_medicare_inpatient_2016.parquet') + GROUP BY PLAN_PMT_AMT +), +combined_data AS ( + SELECT * FROM commercial_data + UNION ALL + SELECT * FROM medicaid_data + UNION ALL + SELECT * FROM medicare_data +) +SELECT + Payment, + count, + Insurance +FROM combined_data +ORDER BY Insurance, Payment \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_inpatient_2016.sql b/healthcare_data/models/generated/with_types/syhdr_commercial_inpatient_2016.sql index aa67ad2..6155748 100644 --- a/healthcare_data/models/generated/with_types/syhdr_commercial_inpatient_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_commercial_inpatient_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - FACILITY_ID::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + FACILITY_ID::UBIGINT, + CLM_CNTL_NUM::NUMERIC, AT_SPCLTY::VARCHAR, - SRVC_BEG_DATE::VARCHAR, - SRVC_END_DATE::VARCHAR, - LOS::VARCHAR, + SRVC_BEG_DATE::DATE, + SRVC_END_DATE::DATE, + LOS::UINTEGER, ADMSN_TYPE::VARCHAR, TOB_CD::VARCHAR, CLM_TYPE_CD::VARCHAR, @@ -101,6 +101,6 @@ SELECT CPT_PRCDR_CD_33::VARCHAR, CPT_PRCDR_CD_34::VARCHAR, CPT_PRCDR_CD_35::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_commercial_inpatient_2016.CSV', header=True, null_padding=true, types={'CPT_PRCDR_CD_1': 'VARCHAR', 'CPT_PRCDR_CD_2': 'VARCHAR', 'CPT_PRCDR_CD_3': 'VARCHAR', 'CPT_PRCDR_CD_4': 'VARCHAR', 'CPT_PRCDR_CD_5': 'VARCHAR', 'CPT_PRCDR_CD_6': 'VARCHAR', 'CPT_PRCDR_CD_7': 'VARCHAR', 'CPT_PRCDR_CD_8': 'VARCHAR', 'CPT_PRCDR_CD_9': 'VARCHAR', 'CPT_PRCDR_CD_10': 'VARCHAR', 'CPT_PRCDR_CD_11': 'VARCHAR', 'CPT_PRCDR_CD_12': 'VARCHAR', 'CPT_PRCDR_CD_13': 'VARCHAR', 'CPT_PRCDR_CD_14': 'VARCHAR', 'CPT_PRCDR_CD_15': 'VARCHAR', 'CPT_PRCDR_CD_16': 'VARCHAR', 'CPT_PRCDR_CD_17': 'VARCHAR', 'CPT_PRCDR_CD_18': 'VARCHAR', 'CPT_PRCDR_CD_19': 'VARCHAR', 'CPT_PRCDR_CD_20': 'VARCHAR', 'CPT_PRCDR_CD_21': 'VARCHAR', 'CPT_PRCDR_CD_22': 'VARCHAR', 'CPT_PRCDR_CD_23': 'VARCHAR', 'CPT_PRCDR_CD_24': 'VARCHAR', 'CPT_PRCDR_CD_25': 'VARCHAR', 'CPT_PRCDR_CD_26': 'VARCHAR', 'CPT_PRCDR_CD_27': 'VARCHAR', 'CPT_PRCDR_CD_28': 'VARCHAR', 'CPT_PRCDR_CD_29': 'VARCHAR', 'CPT_PRCDR_CD_30': 'VARCHAR', 'CPT_PRCDR_CD_31': 'VARCHAR', 'CPT_PRCDR_CD_32': 'VARCHAR', 'CPT_PRCDR_CD_33': 'VARCHAR', 'CPT_PRCDR_CD_34': 'VARCHAR', 'CPT_PRCDR_CD_35': 'VARCHAR'}, ignore_errors=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_outpatient_2016.sql b/healthcare_data/models/generated/with_types/syhdr_commercial_outpatient_2016.sql index fd3e5f7..01bfd5c 100644 --- a/healthcare_data/models/generated/with_types/syhdr_commercial_outpatient_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_commercial_outpatient_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - FACILITY_ID::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + FACILITY_ID::UBIGINT, + CLM_CNTL_NUM::NUMERIC, AT_SPCLTY::VARCHAR, - SRVC_BEG_DATE::VARCHAR, - SRVC_END_DATE::VARCHAR, - LOS::VARCHAR, + SRVC_BEG_DATE::DATE, + SRVC_END_DATE::DATE, + LOS::UINTEGER, ADMSN_TYPE::VARCHAR, TOB_CD::VARCHAR, CLM_TYPE_CD::VARCHAR, @@ -101,6 +101,6 @@ SELECT CPT_PRCDR_CD_33::VARCHAR, CPT_PRCDR_CD_34::VARCHAR, CPT_PRCDR_CD_35::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_commercial_outpatient_2016.CSV', header=True, null_padding=true, types={'CPT_PRCDR_CD_1': 'VARCHAR', 'CPT_PRCDR_CD_2': 'VARCHAR', 'CPT_PRCDR_CD_3': 'VARCHAR', 'CPT_PRCDR_CD_4': 'VARCHAR', 'CPT_PRCDR_CD_5': 'VARCHAR', 'CPT_PRCDR_CD_6': 'VARCHAR', 'CPT_PRCDR_CD_7': 'VARCHAR', 'CPT_PRCDR_CD_8': 'VARCHAR', 'CPT_PRCDR_CD_9': 'VARCHAR', 'CPT_PRCDR_CD_10': 'VARCHAR', 'CPT_PRCDR_CD_11': 'VARCHAR', 'CPT_PRCDR_CD_12': 'VARCHAR', 'CPT_PRCDR_CD_13': 'VARCHAR', 'CPT_PRCDR_CD_14': 'VARCHAR', 'CPT_PRCDR_CD_15': 'VARCHAR', 'CPT_PRCDR_CD_16': 'VARCHAR', 'CPT_PRCDR_CD_17': 'VARCHAR', 'CPT_PRCDR_CD_18': 'VARCHAR', 'CPT_PRCDR_CD_19': 'VARCHAR', 'CPT_PRCDR_CD_20': 'VARCHAR', 'CPT_PRCDR_CD_21': 'VARCHAR', 'CPT_PRCDR_CD_22': 'VARCHAR', 'CPT_PRCDR_CD_23': 'VARCHAR', 'CPT_PRCDR_CD_24': 'VARCHAR', 'CPT_PRCDR_CD_25': 'VARCHAR', 'CPT_PRCDR_CD_26': 'VARCHAR', 'CPT_PRCDR_CD_27': 'VARCHAR', 'CPT_PRCDR_CD_28': 'VARCHAR', 'CPT_PRCDR_CD_29': 'VARCHAR', 'CPT_PRCDR_CD_30': 'VARCHAR', 'CPT_PRCDR_CD_31': 'VARCHAR', 'CPT_PRCDR_CD_32': 'VARCHAR', 'CPT_PRCDR_CD_33': 'VARCHAR', 'CPT_PRCDR_CD_34': 'VARCHAR', 'CPT_PRCDR_CD_35': 'VARCHAR'}, ignore_errors=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_person_2016.sql b/healthcare_data/models/generated/with_types/syhdr_commercial_person_2016.sql index 891cbce..9f99ab0 100644 --- a/healthcare_data/models/generated/with_types/syhdr_commercial_person_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_commercial_person_2016.sql @@ -2,15 +2,15 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - AGE_LOW::VARCHAR, - AGE_HIGH::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + AGE_LOW::NUMERIC, + AGE_HIGH::NUMERIC, SEX_IDENT_CD::VARCHAR, STATE_CD::VARCHAR, COUNTY_FIPS_CD::VARCHAR, ZIP_CD::VARCHAR, - PHRMCY_CVRG_1::VARCHAR, + PHRMCY_CVRG_1::NUMERIC, PHRMCY_CVRG_2::VARCHAR, PHRMCY_CVRG_3::VARCHAR, PHRMCY_CVRG_4::VARCHAR, @@ -21,8 +21,8 @@ SELECT PHRMCY_CVRG_9::VARCHAR, PHRMCY_CVRG_10::VARCHAR, PHRMCY_CVRG_11::VARCHAR, - PHRMCY_CVRG_12::VARCHAR, - CMRCL_INSRC_1::VARCHAR, + PHRMCY_CVRG_12::NUMERIC, + CMRCL_INSRC_1::NUMERIC, CMRCL_INSRC_2::VARCHAR, CMRCL_INSRC_3::VARCHAR, CMRCL_INSRC_4::VARCHAR, @@ -33,5 +33,5 @@ SELECT CMRCL_INSRC_9::VARCHAR, CMRCL_INSRC_10::VARCHAR, CMRCL_INSRC_11::VARCHAR, - CMRCL_INSRC_12::VARCHAR + CMRCL_INSRC_12::NUMERIC FROM read_csv('/Users/me/data/syh_dr/syhdr_commercial_person_2016.CSV', header=True, null_padding=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_commercial_pharmacy_2016.sql b/healthcare_data/models/generated/with_types/syhdr_commercial_pharmacy_2016.sql index 0dc6205..74e7964 100644 --- a/healthcare_data/models/generated/with_types/syhdr_commercial_pharmacy_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_commercial_pharmacy_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - PHMCY_CLM_NUM::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + PHMCY_CLM_NUM::NUMERIC, + CLM_CNTL_NUM::NUMERIC, LINE_NBR::VARCHAR, - FILL_DT::VARCHAR, + FILL_DT::DATE, SYNTHETIC_DRUG_ID::VARCHAR, GENERIC_DRUG_NAME::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_commercial_pharmacy_2016.CSV', header=True, null_padding=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_inpatient_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicaid_inpatient_2016.sql index 0d55bf9..37bcccc 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicaid_inpatient_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicaid_inpatient_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - FACILITY_ID::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + FACILITY_ID::UBIGINT, + CLM_CNTL_NUM::NUMERIC, AT_SPCLTY::VARCHAR, - SRVC_BEG_DATE::VARCHAR, - SRVC_END_DATE::VARCHAR, - LOS::VARCHAR, + SRVC_BEG_DATE::DATE, + SRVC_END_DATE::DATE, + LOS::UINTEGER, ADMSN_TYPE::VARCHAR, TOB_CD::VARCHAR, CLM_TYPE_CD::VARCHAR, @@ -101,6 +101,6 @@ SELECT CPT_PRCDR_CD_33::VARCHAR, CPT_PRCDR_CD_34::VARCHAR, CPT_PRCDR_CD_35::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_medicaid_inpatient_2016.CSV', header=True, null_padding=true, types={'CPT_PRCDR_CD_1': 'VARCHAR', 'CPT_PRCDR_CD_2': 'VARCHAR', 'CPT_PRCDR_CD_3': 'VARCHAR', 'CPT_PRCDR_CD_4': 'VARCHAR', 'CPT_PRCDR_CD_5': 'VARCHAR', 'CPT_PRCDR_CD_6': 'VARCHAR', 'CPT_PRCDR_CD_7': 'VARCHAR', 'CPT_PRCDR_CD_8': 'VARCHAR', 'CPT_PRCDR_CD_9': 'VARCHAR', 'CPT_PRCDR_CD_10': 'VARCHAR', 'CPT_PRCDR_CD_11': 'VARCHAR', 'CPT_PRCDR_CD_12': 'VARCHAR', 'CPT_PRCDR_CD_13': 'VARCHAR', 'CPT_PRCDR_CD_14': 'VARCHAR', 'CPT_PRCDR_CD_15': 'VARCHAR', 'CPT_PRCDR_CD_16': 'VARCHAR', 'CPT_PRCDR_CD_17': 'VARCHAR', 'CPT_PRCDR_CD_18': 'VARCHAR', 'CPT_PRCDR_CD_19': 'VARCHAR', 'CPT_PRCDR_CD_20': 'VARCHAR', 'CPT_PRCDR_CD_21': 'VARCHAR', 'CPT_PRCDR_CD_22': 'VARCHAR', 'CPT_PRCDR_CD_23': 'VARCHAR', 'CPT_PRCDR_CD_24': 'VARCHAR', 'CPT_PRCDR_CD_25': 'VARCHAR', 'CPT_PRCDR_CD_26': 'VARCHAR', 'CPT_PRCDR_CD_27': 'VARCHAR', 'CPT_PRCDR_CD_28': 'VARCHAR', 'CPT_PRCDR_CD_29': 'VARCHAR', 'CPT_PRCDR_CD_30': 'VARCHAR', 'CPT_PRCDR_CD_31': 'VARCHAR', 'CPT_PRCDR_CD_32': 'VARCHAR', 'CPT_PRCDR_CD_33': 'VARCHAR', 'CPT_PRCDR_CD_34': 'VARCHAR', 'CPT_PRCDR_CD_35': 'VARCHAR'}, ignore_errors=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_outpatient_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicaid_outpatient_2016.sql index deb4639..5d55b53 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicaid_outpatient_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicaid_outpatient_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - FACILITY_ID::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + FACILITY_ID::UBIGINT, + CLM_CNTL_NUM::NUMERIC, AT_SPCLTY::VARCHAR, - SRVC_BEG_DATE::VARCHAR, - SRVC_END_DATE::VARCHAR, - LOS::VARCHAR, + SRVC_BEG_DATE::DATE, + SRVC_END_DATE::DATE, + LOS::UINTEGER, ADMSN_TYPE::VARCHAR, TOB_CD::VARCHAR, CLM_TYPE_CD::VARCHAR, @@ -101,6 +101,6 @@ SELECT CPT_PRCDR_CD_33::VARCHAR, CPT_PRCDR_CD_34::VARCHAR, CPT_PRCDR_CD_35::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_medicaid_outpatient_2016.CSV', header=True, null_padding=true, types={'CPT_PRCDR_CD_1': 'VARCHAR', 'CPT_PRCDR_CD_2': 'VARCHAR', 'CPT_PRCDR_CD_3': 'VARCHAR', 'CPT_PRCDR_CD_4': 'VARCHAR', 'CPT_PRCDR_CD_5': 'VARCHAR', 'CPT_PRCDR_CD_6': 'VARCHAR', 'CPT_PRCDR_CD_7': 'VARCHAR', 'CPT_PRCDR_CD_8': 'VARCHAR', 'CPT_PRCDR_CD_9': 'VARCHAR', 'CPT_PRCDR_CD_10': 'VARCHAR', 'CPT_PRCDR_CD_11': 'VARCHAR', 'CPT_PRCDR_CD_12': 'VARCHAR', 'CPT_PRCDR_CD_13': 'VARCHAR', 'CPT_PRCDR_CD_14': 'VARCHAR', 'CPT_PRCDR_CD_15': 'VARCHAR', 'CPT_PRCDR_CD_16': 'VARCHAR', 'CPT_PRCDR_CD_17': 'VARCHAR', 'CPT_PRCDR_CD_18': 'VARCHAR', 'CPT_PRCDR_CD_19': 'VARCHAR', 'CPT_PRCDR_CD_20': 'VARCHAR', 'CPT_PRCDR_CD_21': 'VARCHAR', 'CPT_PRCDR_CD_22': 'VARCHAR', 'CPT_PRCDR_CD_23': 'VARCHAR', 'CPT_PRCDR_CD_24': 'VARCHAR', 'CPT_PRCDR_CD_25': 'VARCHAR', 'CPT_PRCDR_CD_26': 'VARCHAR', 'CPT_PRCDR_CD_27': 'VARCHAR', 'CPT_PRCDR_CD_28': 'VARCHAR', 'CPT_PRCDR_CD_29': 'VARCHAR', 'CPT_PRCDR_CD_30': 'VARCHAR', 'CPT_PRCDR_CD_31': 'VARCHAR', 'CPT_PRCDR_CD_32': 'VARCHAR', 'CPT_PRCDR_CD_33': 'VARCHAR', 'CPT_PRCDR_CD_34': 'VARCHAR', 'CPT_PRCDR_CD_35': 'VARCHAR'}, ignore_errors=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_person_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicaid_person_2016.sql index 6df7fe3..b6ba9f9 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicaid_person_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicaid_person_2016.sql @@ -2,11 +2,11 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - MCAID_BENE_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - AGE_LOW::VARCHAR, - AGE_HIGH::VARCHAR, + PERSON_ID::UBIGINT, + MCAID_BENE_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + AGE_LOW::NUMERIC, + AGE_HIGH::NUMERIC, SEX_IDENT_CD::VARCHAR, RACE_CD::VARCHAR, MCAID_SBMTTG_ST_CD::VARCHAR, @@ -14,7 +14,7 @@ SELECT COUNTY_FIPS_CD::VARCHAR, ZIP_CD::VARCHAR, RSN_ENRLMT_CD::VARCHAR, - MDCD_ENRLMT_1::VARCHAR, + MDCD_ENRLMT_1::NUMERIC, MDCD_ENRLMT_2::VARCHAR, MDCD_ENRLMT_3::VARCHAR, MDCD_ENRLMT_4::VARCHAR, @@ -25,8 +25,8 @@ SELECT MDCD_ENRLMT_9::VARCHAR, MDCD_ENRLMT_10::VARCHAR, MDCD_ENRLMT_11::VARCHAR, - MDCD_ENRLMT_12::VARCHAR, - MDCD_MCO_ENRLMT_1::VARCHAR, + MDCD_ENRLMT_12::NUMERIC, + MDCD_MCO_ENRLMT_1::NUMERIC, MDCD_MCO_ENRLMT_2::VARCHAR, MDCD_MCO_ENRLMT_3::VARCHAR, MDCD_MCO_ENRLMT_4::VARCHAR, @@ -37,10 +37,10 @@ SELECT MDCD_MCO_ENRLMT_9::VARCHAR, MDCD_MCO_ENRLMT_10::VARCHAR, MDCD_MCO_ENRLMT_11::VARCHAR, - MDCD_MCO_ENRLMT_12::VARCHAR, - MDCD_CHIP_ENRLMT::VARCHAR, + MDCD_MCO_ENRLMT_12::NUMERIC, + MDCD_CHIP_ENRLMT::NUMERIC, RSTRCTD_BNFTS_IND::VARCHAR, - DUAL_ELGBL_1::VARCHAR, + DUAL_ELGBL_1::NUMERIC, DUAL_ELGBL_2::VARCHAR, DUAL_ELGBL_3::VARCHAR, DUAL_ELGBL_4::VARCHAR, @@ -51,5 +51,5 @@ SELECT DUAL_ELGBL_9::VARCHAR, DUAL_ELGBL_10::VARCHAR, DUAL_ELGBL_11::VARCHAR, - DUAL_ELGBL_12::VARCHAR + DUAL_ELGBL_12::NUMERIC FROM read_csv('/Users/me/data/syh_dr/syhdr_medicaid_person_2016.CSV', header=True, null_padding=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicaid_pharmacy_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicaid_pharmacy_2016.sql index 53be3b9..0f70e38 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicaid_pharmacy_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicaid_pharmacy_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - PHMCY_CLM_NUM::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + PHMCY_CLM_NUM::NUMERIC, + CLM_CNTL_NUM::NUMERIC, LINE_NBR::VARCHAR, - FILL_DT::VARCHAR, + FILL_DT::DATE, SYNTHETIC_DRUG_ID::VARCHAR, GENERIC_DRUG_NAME::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_medicaid_pharmacy_2016.CSV', header=True, null_padding=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_inpatient_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicare_inpatient_2016.sql index 782c3ac..524a707 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicare_inpatient_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicare_inpatient_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - FACILITY_ID::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + FACILITY_ID::UBIGINT, + CLM_CNTL_NUM::NUMERIC, AT_SPCLTY::VARCHAR, - SRVC_BEG_DATE::VARCHAR, - SRVC_END_DATE::VARCHAR, - LOS::VARCHAR, + SRVC_BEG_DATE::DATE, + SRVC_END_DATE::DATE, + LOS::UINTEGER, ADMSN_TYPE::VARCHAR, TOB_CD::VARCHAR, CLM_TYPE_CD::VARCHAR, @@ -101,6 +101,6 @@ SELECT CPT_PRCDR_CD_33::VARCHAR, CPT_PRCDR_CD_34::VARCHAR, CPT_PRCDR_CD_35::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_medicare_inpatient_2016.CSV', header=True, null_padding=true, types={'CPT_PRCDR_CD_1': 'VARCHAR', 'CPT_PRCDR_CD_2': 'VARCHAR', 'CPT_PRCDR_CD_3': 'VARCHAR', 'CPT_PRCDR_CD_4': 'VARCHAR', 'CPT_PRCDR_CD_5': 'VARCHAR', 'CPT_PRCDR_CD_6': 'VARCHAR', 'CPT_PRCDR_CD_7': 'VARCHAR', 'CPT_PRCDR_CD_8': 'VARCHAR', 'CPT_PRCDR_CD_9': 'VARCHAR', 'CPT_PRCDR_CD_10': 'VARCHAR', 'CPT_PRCDR_CD_11': 'VARCHAR', 'CPT_PRCDR_CD_12': 'VARCHAR', 'CPT_PRCDR_CD_13': 'VARCHAR', 'CPT_PRCDR_CD_14': 'VARCHAR', 'CPT_PRCDR_CD_15': 'VARCHAR', 'CPT_PRCDR_CD_16': 'VARCHAR', 'CPT_PRCDR_CD_17': 'VARCHAR', 'CPT_PRCDR_CD_18': 'VARCHAR', 'CPT_PRCDR_CD_19': 'VARCHAR', 'CPT_PRCDR_CD_20': 'VARCHAR', 'CPT_PRCDR_CD_21': 'VARCHAR', 'CPT_PRCDR_CD_22': 'VARCHAR', 'CPT_PRCDR_CD_23': 'VARCHAR', 'CPT_PRCDR_CD_24': 'VARCHAR', 'CPT_PRCDR_CD_25': 'VARCHAR', 'CPT_PRCDR_CD_26': 'VARCHAR', 'CPT_PRCDR_CD_27': 'VARCHAR', 'CPT_PRCDR_CD_28': 'VARCHAR', 'CPT_PRCDR_CD_29': 'VARCHAR', 'CPT_PRCDR_CD_30': 'VARCHAR', 'CPT_PRCDR_CD_31': 'VARCHAR', 'CPT_PRCDR_CD_32': 'VARCHAR', 'CPT_PRCDR_CD_33': 'VARCHAR', 'CPT_PRCDR_CD_34': 'VARCHAR', 'CPT_PRCDR_CD_35': 'VARCHAR'}, ignore_errors=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_outpatient_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicare_outpatient_2016.sql index bd5160d..eff0173 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicare_outpatient_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicare_outpatient_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - FACILITY_ID::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + FACILITY_ID::UBIGINT, + CLM_CNTL_NUM::NUMERIC, AT_SPCLTY::VARCHAR, - SRVC_BEG_DATE::VARCHAR, - SRVC_END_DATE::VARCHAR, - LOS::VARCHAR, + SRVC_BEG_DATE::DATE, + SRVC_END_DATE::DATE, + LOS::UINTEGER, ADMSN_TYPE::VARCHAR, TOB_CD::VARCHAR, CLM_TYPE_CD::VARCHAR, @@ -101,6 +101,6 @@ SELECT CPT_PRCDR_CD_33::VARCHAR, CPT_PRCDR_CD_34::VARCHAR, CPT_PRCDR_CD_35::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_medicare_outpatient_2016.CSV', header=True, null_padding=true, types={'CPT_PRCDR_CD_1': 'VARCHAR', 'CPT_PRCDR_CD_2': 'VARCHAR', 'CPT_PRCDR_CD_3': 'VARCHAR', 'CPT_PRCDR_CD_4': 'VARCHAR', 'CPT_PRCDR_CD_5': 'VARCHAR', 'CPT_PRCDR_CD_6': 'VARCHAR', 'CPT_PRCDR_CD_7': 'VARCHAR', 'CPT_PRCDR_CD_8': 'VARCHAR', 'CPT_PRCDR_CD_9': 'VARCHAR', 'CPT_PRCDR_CD_10': 'VARCHAR', 'CPT_PRCDR_CD_11': 'VARCHAR', 'CPT_PRCDR_CD_12': 'VARCHAR', 'CPT_PRCDR_CD_13': 'VARCHAR', 'CPT_PRCDR_CD_14': 'VARCHAR', 'CPT_PRCDR_CD_15': 'VARCHAR', 'CPT_PRCDR_CD_16': 'VARCHAR', 'CPT_PRCDR_CD_17': 'VARCHAR', 'CPT_PRCDR_CD_18': 'VARCHAR', 'CPT_PRCDR_CD_19': 'VARCHAR', 'CPT_PRCDR_CD_20': 'VARCHAR', 'CPT_PRCDR_CD_21': 'VARCHAR', 'CPT_PRCDR_CD_22': 'VARCHAR', 'CPT_PRCDR_CD_23': 'VARCHAR', 'CPT_PRCDR_CD_24': 'VARCHAR', 'CPT_PRCDR_CD_25': 'VARCHAR', 'CPT_PRCDR_CD_26': 'VARCHAR', 'CPT_PRCDR_CD_27': 'VARCHAR', 'CPT_PRCDR_CD_28': 'VARCHAR', 'CPT_PRCDR_CD_29': 'VARCHAR', 'CPT_PRCDR_CD_30': 'VARCHAR', 'CPT_PRCDR_CD_31': 'VARCHAR', 'CPT_PRCDR_CD_32': 'VARCHAR', 'CPT_PRCDR_CD_33': 'VARCHAR', 'CPT_PRCDR_CD_34': 'VARCHAR', 'CPT_PRCDR_CD_35': 'VARCHAR'}, ignore_errors=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_person_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicare_person_2016.sql index c5a3b6e..5076a15 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicare_person_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicare_person_2016.sql @@ -2,10 +2,10 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - AGE_LOW::VARCHAR, - AGE_HIGH::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + AGE_LOW::NUMERIC, + AGE_HIGH::NUMERIC, SEX_IDENT_CD::VARCHAR, RACE_CD::VARCHAR, STATE_CD::VARCHAR, @@ -24,7 +24,7 @@ SELECT MDCR_ENTLMT_IND_10::VARCHAR, MDCR_ENTLMT_IND_11::VARCHAR, MDCR_ENTLMT_IND_12::VARCHAR, - MDCR_HMO_CVRG_1::VARCHAR, + MDCR_HMO_CVRG_1::NUMERIC, MDCR_HMO_CVRG_2::VARCHAR, MDCR_HMO_CVRG_3::VARCHAR, MDCR_HMO_CVRG_4::VARCHAR, @@ -35,8 +35,8 @@ SELECT MDCR_HMO_CVRG_9::VARCHAR, MDCR_HMO_CVRG_10::VARCHAR, MDCR_HMO_CVRG_11::VARCHAR, - MDCR_HMO_CVRG_12::VARCHAR, - PHRMCY_CVRG_1::VARCHAR, + MDCR_HMO_CVRG_12::NUMERIC, + PHRMCY_CVRG_1::NUMERIC, PHRMCY_CVRG_2::VARCHAR, PHRMCY_CVRG_3::VARCHAR, PHRMCY_CVRG_4::VARCHAR, @@ -47,8 +47,8 @@ SELECT PHRMCY_CVRG_9::VARCHAR, PHRMCY_CVRG_10::VARCHAR, PHRMCY_CVRG_11::VARCHAR, - PHRMCY_CVRG_12::VARCHAR, - DUAL_ELGBL_1::VARCHAR, + PHRMCY_CVRG_12::NUMERIC, + DUAL_ELGBL_1::NUMERIC, DUAL_ELGBL_2::VARCHAR, DUAL_ELGBL_3::VARCHAR, DUAL_ELGBL_4::VARCHAR, @@ -59,5 +59,5 @@ SELECT DUAL_ELGBL_9::VARCHAR, DUAL_ELGBL_10::VARCHAR, DUAL_ELGBL_11::VARCHAR, - DUAL_ELGBL_12::VARCHAR + DUAL_ELGBL_12::NUMERIC FROM read_csv('/Users/me/data/syh_dr/syhdr_medicare_person_2016.CSV', header=True, null_padding=true) \ No newline at end of file diff --git a/healthcare_data/models/generated/with_types/syhdr_medicare_pharmacy_2016.sql b/healthcare_data/models/generated/with_types/syhdr_medicare_pharmacy_2016.sql index 2f18b05..1070461 100644 --- a/healthcare_data/models/generated/with_types/syhdr_medicare_pharmacy_2016.sql +++ b/healthcare_data/models/generated/with_types/syhdr_medicare_pharmacy_2016.sql @@ -2,14 +2,14 @@ {{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }} SELECT - PERSON_ID::VARCHAR, - PERSON_WGHT::VARCHAR, - PHMCY_CLM_NUM::VARCHAR, - CLM_CNTL_NUM::VARCHAR, + PERSON_ID::UBIGINT, + PERSON_WGHT::NUMERIC, + PHMCY_CLM_NUM::NUMERIC, + CLM_CNTL_NUM::NUMERIC, LINE_NBR::VARCHAR, - FILL_DT::VARCHAR, + FILL_DT::DATE, SYNTHETIC_DRUG_ID::VARCHAR, GENERIC_DRUG_NAME::VARCHAR, - PLAN_PMT_AMT::VARCHAR, - TOT_CHRG_AMT::VARCHAR + replace(replace(PLAN_PMT_AMT, '$', ''), ',', '')::FLOAT, + replace(replace(TOT_CHRG_AMT, '$', ''), ',', '')::FLOAT FROM read_csv('/Users/me/data/syh_dr/syhdr_medicare_pharmacy_2016.CSV', header=True, null_padding=true) \ No newline at end of file diff --git a/healthcare_data/scripts/generate_syh_dr_data_models.py b/healthcare_data/scripts/generate_syh_dr_data_models.py index dcda078..55558d8 100644 --- a/healthcare_data/scripts/generate_syh_dr_data_models.py +++ b/healthcare_data/scripts/generate_syh_dr_data_models.py @@ -5,6 +5,7 @@ import tempfile import pdfplumber + def process_csv_files(pdf_url, csv_folder): # Download the data dictionary PDF response = requests.get(pdf_url) @@ -30,7 +31,7 @@ def process_csv_files(pdf_url, csv_folder): "syhdr_medicare_outpatient_2016.CSV", "syhdr_medicare_person_2016.CSV", "syhdr_medicare_pharmacy_2016.CSV", - "syhdr_medicare_provider_2016.csv" + "syhdr_medicare_provider_2016.csv", ] # Process each CSV file @@ -58,7 +59,8 @@ def process_csv_files(pdf_url, csv_folder): csv_types = None for column_name in column_names: print(f"Processing column: {column_name}") - + data_type = None + # Manually set the data type for some columns if "_DT" in column_name or "_DATE" in column_name: data_type = "DATE" elif "_ST_CD" in column_name: @@ -72,16 +74,16 @@ def process_csv_files(pdf_url, csv_folder): elif "_ID" in column_name and "CD" not in column_name: data_type = "UBIGINT" elif "LOS" in column_name: - data_type = "UINT" - - if data_type: - print(f"Data type for column '{column_name}': {data_type}") - continue + data_type = "UINTEGER" + elif "AMT" in column_name: + data_type = "FLOAT" for page_number in range(10, len(pdf.pages)): print(f"Searching for column '{column_name}' on page {page_number + 1}") page = pdf.pages[page_number] - cropped_page = page.crop((72, 86.4, page.width - 72, page.height - 70.0)) + cropped_page = page.crop( + (72, 86.4, page.width - 72, page.height - 70.0) + ) text = cropped_page.extract_text() lines = text.split("\n") @@ -89,12 +91,12 @@ def process_csv_files(pdf_url, csv_folder): print(f"Column '{column_name}' found on page {page_number + 1}") # Extract the first occurrence of "Character" or "Numeric" before the table start - data_type = None - if "Character" in lines: - data_type = "VARCHAR" - elif "Numeric" in lines: - data_type = "NUMERIC" - print(f"Data type for column '{column_name}': {data_type}") + if data_type is None: + if "Character" in lines: + data_type = "VARCHAR" + elif "Numeric" in lines: + data_type = "NUMERIC" + print(f"Data type for column '{column_name}': {data_type}") if data_type: column_definitions[column_name] = data_type @@ -107,7 +109,9 @@ def process_csv_files(pdf_url, csv_folder): with open(os.path.join("models/generated/with_types", sql_file), "w") as f: # Write the SQL model header f.write(f"-- SQL model for {csv_file}\n") - f.write("{{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }}\n\n") + f.write( + "{{ config(materialized='external', location=var('output_path') + '/' + this.name + '.parquet') }}\n\n" + ) # Write the SQL SELECT statement column_list = [] @@ -115,8 +119,10 @@ def process_csv_files(pdf_url, csv_folder): if column_name in column_definitions: data_type = column_definitions[column_name] if "AMT" in column_name: - # remove dollar sign from the VARCHAR string - column_list.append(f"replace(replace({column_name}, '$', ''), ',', '')::{data_type}") + # remove dollar sign from the VARCHAR string + column_list.append( + f"replace(replace({column_name}, '$', ''), ',', '')::{data_type}" + ) else: column_list.append(f"{column_name}::{data_type}") else: @@ -136,13 +142,17 @@ def process_csv_files(pdf_url, csv_folder): # Clean up the temporary PDF file os.unlink(pdf_path) + def main(csv_folder, data_dictionary_url): process_csv_files(data_dictionary_url, csv_folder) + if __name__ == "__main__": if len(sys.argv) != 3: - print("Usage: python generate_syh_dr_sql_models.py ") + print( + "Usage: python generate_syh_dr_sql_models.py " + ) sys.exit(1) csv_folder = sys.argv[1] data_dictionary_url = sys.argv[2] - main(csv_folder, data_dictionary_url) \ No newline at end of file + main(csv_folder, data_dictionary_url)