Skip to content

Commit

Permalink
formatting tweaks post-linter tweak, adding new growth data files, refs
Browse files Browse the repository at this point in the history
  • Loading branch information
dchud committed Feb 26, 2023
1 parent 96c6990 commit d71671a
Show file tree
Hide file tree
Showing 9 changed files with 81 additions and 21,261 deletions.
Binary file removed growthviz-data/.DS_Store
Binary file not shown.
Binary file added growthviz-data/ext/growthfile_cdc_ext.csv.gz
Binary file not shown.
Binary file added growthviz-data/ext/growthfile_who.csv.gz
Binary file not shown.
21,195 changes: 0 additions & 21,195 deletions growthviz-data/ext/swaps.csv

This file was deleted.

64 changes: 33 additions & 31 deletions growthviz/charts.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,19 +45,20 @@ def weight_distr(df, mode):
Create charts with overall and outlier weight distributions (included values only)
Parameters:
df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
category, colors, patterns, and sort_order columns
mode: (str) indicates how many of the weights you want to use. If set to 'high', the function
will only use weights above a certain threshold. Otherwise, it displays all the weights.
df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, and
include columns
mode: (str) indicates how many of the weights you want to use. If set to 'high',
the function will only use weights above a certain threshold. Otherwise, it
displays all the weights.
"""
wgt_grp = df[(df["param"] == "WEIGHTKG") & (df["include"] == True)]
wgt_grp = df[(df["param"] == "WEIGHTKG") & (df["include"] is True)]
if mode == "high":
wgt_grp = wgt_grp.loc[wgt_grp["measurement"] >= 135]
plt.title("Weights At or Above 135kg")
else:
plt.title("All Weights")
if len(wgt_grp.index) == 0:
print("No included observations with weight (kg) >= 135")
print("No included observations with weight (kg) >= 135.")
plt.close()
else:
round_col = wgt_grp.apply(
Expand All @@ -66,7 +67,7 @@ def weight_distr(df, mode):
wgt_grp = wgt_grp.assign(round_weight=round_col.values)
wgt_grp_sum = wgt_grp.groupby("round_weight")["subjid"].count().reset_index()
plt.rcParams["figure.figsize"] = [7, 5]
wgt_grp_sum_plot = plt.bar(wgt_grp_sum["round_weight"], wgt_grp_sum["subjid"])
plt.bar(wgt_grp_sum["round_weight"], wgt_grp_sum["subjid"])
# Ensure there is some breadth to the x-axis in case of just a few observations
if wgt_grp["measurement"].max() - wgt_grp["measurement"].min() < 10:
plt.xlim(wgt_grp["measurement"].min() - 5, wgt_grp["measurement"].max() + 5)
Expand All @@ -78,11 +79,12 @@ def weight_distr(df, mode):

def make_age_charts(df, mode):
"""
Creates a chart with the age ranges in the dataset. Counts the number of subjids in each range.
Creates a chart with the age ranges in the dataset. Counts the number of subjids in
each range.
Parameters:
df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
category, colors, patterns, and sort_order columns
df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat,
include, category, colors, patterns, and sort_order columns
mode: (str) indicates whether you want the adults or pediatrics values.
"""
obs_grp = df
Expand All @@ -94,8 +96,8 @@ def make_age_charts(df, mode):
else:
raise Exception("Valid modes are 'adults' and 'pediatrics'")

# Adds label, color, pattern and sort order columns to the dataframe based on the age of each
# row in the dataframe
# Adds label, color, pattern and sort order columns to the dataframe based on the
# age of each row in the dataframe
def add_categories_to_frame(df_data, df_reference):
categories = []
colors = []
Expand All @@ -119,8 +121,8 @@ def add_categories_to_frame(df_data, df_reference):
# Call the categorizing function on the data
obs_grp = add_categories_to_frame(obs_grp, label_frame)

# Groups the new dataframe by category, sort order, colors and patterns. It then counts the
# number of subject ids in each group and sorts the values by sort order.
# Groups the new dataframe by category, sort order, colors and patterns. It then
# counts the number of subject ids in each group and sorts the values by sort order.
obs_grp = (
obs_grp.groupby(["category", "sort_order", "colors", "patterns"])["subjid"]
.count()
Expand Down Expand Up @@ -214,14 +216,14 @@ def overlap_view_adults(
xmin = math.floor(individual.age.min())
xmax = math.ceil(individual.age.max())
selected_param_plot.set_xlim(xmin, xmax)
if include_carry_forward == True:
if include_carry_forward is True:
carry_forward = selected_param[
selected_param.clean_value == "Exclude-Carried-Forward"
]
selected_param_plot.scatter(
x=carry_forward.age, y=carry_forward.measurement, c="c", marker="^"
)
if include_percentiles == True:
if include_percentiles is True:
if param == "WEIGHTKG":
percentile_df = wt_df
elif param == "BMI":
Expand Down Expand Up @@ -287,7 +289,7 @@ def overlap_view_adults_show(
"""
Wraps overlap_view_adults with plt.show().
"""
plot = overlap_view_adults(
overlap_view_adults(
obs_df,
subjid,
param,
Expand Down Expand Up @@ -345,14 +347,14 @@ def overlap_view_pediatrics(
c="r",
marker="x",
)
if include_carry_forward == True:
if include_carry_forward is True:
carry_forward = selected_param[
selected_param.clean_value == "Exclude-Carried-Forward"
]
selected_param_plot.scatter(
x=carry_forward.age, y=carry_forward.measurement, c="c", marker="^"
)
if include_percentiles == True:
if include_percentiles is True:
percentile_df = wt_df if param == "WEIGHTKG" else ht_df
percentile_window = percentile_df.loc[
(percentile_df.Sex == individual.sex.min())
Expand All @@ -374,7 +376,7 @@ def overlap_view_pediatrics_show(
"""
Wraps overlap_view_pediatrics with plt.show().
"""
plot = overlap_view_pediatrics(
overlap_view_pediatrics(
obs_df, subjid, param, include_carry_forward, include_percentiles, wt_df, ht_df
)
plt.show()
Expand Down Expand Up @@ -442,7 +444,7 @@ def overlap_view_double_pediatrics(
ax2.set_ylabel(
"weight (kg)", color=color_secondary
) # we already handled the x-label with ax1
if include_percentiles == True:
if include_percentiles is True:
percentile_window = wt_df.loc[wt_df.Sex == individual.sex.min()]
ax2.plot(percentile_window.age, percentile_window.P5, color="lightblue")
ax2.plot(
Expand Down Expand Up @@ -476,21 +478,21 @@ def overlap_view_double_pediatrics(
)
ax1.plot(percentile_window_ht.age, percentile_window_ht.P95, color="pink")

if show_all_measurements == True:
if show_all_measurements is True:
ax1.plot(height["age"], height["measurement"], color=color, label="stature")
ax2.plot(
weight["age"], weight["measurement"], color=color_secondary, label="weight"
)

if show_excluded_values == True:
if show_excluded_values is True:
ax1.scatter(
excluded_height.age, excluded_height.measurement, c="black", marker="x"
)
ax2.scatter(
excluded_weight.age, excluded_weight.measurement, c="black", marker="x"
)

if show_trajectory_with_exclusions == True:
if show_trajectory_with_exclusions is True:
ax1.plot(
included_height["age"],
included_height["measurement"],
Expand All @@ -509,7 +511,7 @@ def overlap_view_double_pediatrics(

fig.tight_layout() # otherwise the right y-label is slightly clipped

if include_carry_forward == True:
if include_carry_forward is True:
carry_forward_height = height[height.clean_value == "Exclude-Carried-Forward"]
carry_forward_weight = weight[weight.clean_value == "Exclude-Carried-Forward"]
ax1.scatter(
Expand Down Expand Up @@ -594,7 +596,7 @@ def five_by_five_view(obs_df, subjids, param, wt_df, ht_df, bmi_df, linestyle):
for x in range(nrows):
try:
subjid = subjids[x * 5 + y]
except IndexError as ie:
except IndexError:
# No more subjects to render
break
individual = obs_df[obs_df.subjid == subjid]
Expand Down Expand Up @@ -758,15 +760,15 @@ def top_ten(
in the notebook.
"""
working_set = merged_df
if age != None:
if age is not None:
working_set = working_set.loc[
working_set.rounded_age.ge(age[0]) & working_set.rounded_age.le(age[1])
]
if sex != None:
if sex is not None:
working_set = working_set[working_set.sex == sex]
if wexclusion != None:
if wexclusion is not None:
working_set = working_set[working_set.weight_cat.isin(wexclusion)]
if hexclusion != None:
if hexclusion is not None:
working_set = working_set[working_set.height_cat.isin(hexclusion)]
# if order == 'largest':
# working_set = working_set.nlargest(10, field)
Expand Down Expand Up @@ -796,7 +798,7 @@ def top_ten(
"BMIz",
]
]
if out == None:
if out is None:
return working_set
else:
out.clear_output()
Expand Down
2 changes: 1 addition & 1 deletion growthviz/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def subject_stats_comparison(combined_df):
for rn in combined_df.run_name.unique():
total_subjects = combined_df[combined_df.run_name == rn].subjid.nunique()
only_exclusions = combined_df[
(combined_df.run_name == rn) & (combined_df.include == False)
(combined_df.run_name == rn) & (combined_df.include is False)
]
percent_with_exclusion = (
only_exclusions.subjid.nunique() / total_subjects
Expand Down
76 changes: 44 additions & 32 deletions growthviz/processdata.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from IPython.display import FileLinks
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import FileLinks


def setup_individual_obs_df(obs_df):
"""
Standardizes adults and pediatrics files for clean processing in GrowthViz notebooks
Parameters:
obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value columns
obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value
columns
Returns:
DataFrame with updated columns
Expand Down Expand Up @@ -135,16 +134,23 @@ def setup_percentiles_pediatrics(percentiles_file):

def keep_age_range(df, mode):
"""
Returns specified age range
Returns specified age range, removing extraneous columns as well
Parameters:
df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
category, colors, patterns, and sort_order columns
mode: (str) indicates whether you want the "adults" (18-80) or "pediatrics" (0-25) values
df: (DataFrame) with subjid, param, measurement, age, sex, clean_value, and
include columns
mode: (str) indicates whether you want the "adults" (18-80) or "pediatrics" (0-25)
values
Returns:
DataFrame with filtered ages, unchanged if invalid mode is specified
"""
# Note: dropping these columns is a side effect; this is just the simplest place to remove them
cols_to_drop = []
for extra_col in ["clean_cat", "category", "colors", "patterns", "sort_order"]:
if extra_col in df.columns:
cols_to_drop.append(extra_col)
df = df.drop(columns=cols_to_drop)
if mode == "adults":
return df[df["age"].between(18, 80, inclusive="both")]
elif mode == "pediatrics":
Expand All @@ -158,7 +164,8 @@ def setup_merged_df(obs_df):
Merges together weight and height data for calculating BMI
Parameters:
obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value columns
obs_df: (DataFrame) with subjid, sex, age, measurement, param and clean_value
columns
Returns:
DataFrame with merged data
Expand Down Expand Up @@ -206,7 +213,8 @@ def setup_merged_df(obs_df):

def exclusion_information(obs):
"""
Provides a count and percentage of growthcleanr categories by measurement type (param).
Provides a count and percentage of growthcleanr categories by measurement type
(param).
Parameters:
obs: a DataFrame, in the format output by setup_individual_obs_df
Expand Down Expand Up @@ -238,15 +246,16 @@ def exclusion_information(obs):

def label_incl(row):
"""
Categorizes BMI calculations as Include, Implausible, or unable to calculate (Only Wt or Ht)
Categorizes BMI calculations as Include, Implausible, or unable to calculate (Only
Wt or Ht)
Parameters:
row: (Series) dataframe row
Returns:
Category (str) for BMI calculation
"""
if row["include_both"] == True:
if row["include_both"] is True:
return "Include"
elif (row["weight_cat"] == "Implausible") | (row["height_cat"] == "Implausible"):
return "Implausible"
Expand All @@ -261,8 +270,8 @@ def setup_bmi_adults(merged_df, obs):
Parameters:
merged_df: (DataFrame) with subjid, bmi, include_height, include_weight, rounded_age
and sex columns
obs: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat, include,
category, colors, patterns, and sort_order columns
obs: (DataFrame) with subjid, param, measurement, age, sex, clean_value, clean_cat,
include, category, colors, patterns, and sort_order columns
Returns:
DataFrame with appended values
Expand Down Expand Up @@ -324,15 +333,17 @@ def export_to_csv(da_locals, selection_widget, out):

def clean_swapped_values(merged_df):
"""
This function will look in a DataFrame for rows where the height_cat and weight_cat are set to
"Swapped-Measurements" (or the adult equivalent). It will then swap the height and weight values
for those rows, and recalculate BMIs based on these changes. It will also create two new columns:
postprocess_height_cat and postprocess_weight_cat. The values for these columns are copied from
the original categories except in the case where swaps are fixed when it is set to
This function will look in a DataFrame for rows where the height_cat and weight_cat
are set to "Swapped-Measurements" (or the adult equivalent). It will then swap the
height and weight values for those rows, and recalculate BMIs based on these
changes. It will also create two new columns: postprocess_height_cat and
postprocess_weight_cat. The values for these columns are copied from the original
categories, except that rows where a swap was fixed are set to
"Include-Fixed-Swap".
Parameters:
merged_df: (DataFrame) with subjid, height, weight, include_height and include_weight columns
merged_df: (DataFrame) with subjid, height, weight, include_height and
include_weight columns
Returns:
The cleaned DataFrame
Expand Down Expand Up @@ -368,20 +379,21 @@ def clean_swapped_values(merged_df):

def clean_unit_errors(merged_df):
"""
This function will look in a DataFrame for rows where the height_cat and weight_cat are set to
"Unit-Error-High" or "Unit-Error-Low". It will then multiply / divide the height and weight
values to convert them. It will also create two new columns: postprocess_height_cat and
postprocess_weight_cat. The values for these columns are copied from the original categories
except in the case where unit errors are fixed when it is set to "Include-UH" or "Include-UL"
respectively.
At present, the adult algorithm does not specify high or low unit errors, rather it only flags
"Exclude-Adult-Unit-Errors", so this function only works with pediatrics data. If growthcleanr
adds high and low designations for adult unit errors, a comparable set of conditions could be
added here to accommodate adult data.
This function will look in a DataFrame for rows where the height_cat and weight_cat
are set to "Unit-Error-High" or "Unit-Error-Low". It will then multiply / divide
the height and weight values to convert them. It will also create two new columns:
postprocess_height_cat and postprocess_weight_cat. The values for these columns
are copied from the original categories, except that rows where a unit error was
fixed are set to "Include-UH" or "Include-UL" respectively.
At present, the adult algorithm does not specify high or low unit errors, rather it
only flags "Exclude-Adult-Unit-Errors", so this function only works with pediatrics
data. If growthcleanr adds high and low designations for adult unit errors, a
comparable set of conditions could be added here to accommodate adult data.
Parameters:
merged_df: (DataFrame) with subjid, height, weight, include_height and include_weight columns
merged_df: (DataFrame) with subjid, height, weight, include_height and
include_weight columns
Returns:
The cleaned DataFrame
Expand Down
4 changes: 2 additions & 2 deletions growthviz/sumstats.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from IPython.display import Markdown
import numpy as np
from IPython.display import Markdown


def setup_percentile_zscore_adults(percentiles_clean):
Expand Down Expand Up @@ -169,7 +169,7 @@ def bmi_stats(
merged_stats = merged_stats.rename(
columns={"std_raw": "sd_raw", "std_clean": "sd_clean"}
)
if out == None:
if out is None:
return merged_stats
else:
# Clear output on first update and all subsequent updates, see
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
ipywidgets~=7.0
jupyter-server<2.0.0
matplotlib>=3.3.4
pandas>=1.2.2
qgrid>=1.3.1
Expand Down

0 comments on commit d71671a

Please sign in to comment.