Skip to content

Commit

Permalink
Merge pull request #50 from mitre/t37-update-sample-data
Browse files Browse the repository at this point in the history
Update sample data, closes #37, #51, #52
  • Loading branch information
dchud authored Jul 2, 2021
2 parents 481187d + dba5b93 commit 90f4681
Show file tree
Hide file tree
Showing 13 changed files with 81,300 additions and 87,825 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,4 @@ cython_debug/
*.swo
*.swp
.DS_Store
output
1,887 changes: 525 additions & 1,362 deletions GrowthViz-adults.ipynb

Large diffs are not rendered by default.

65 changes: 28 additions & 37 deletions GrowthViz-adults.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,17 +263,17 @@
'BMIz': { 'width': 30 },
}
g = qgrid.show_grid(charts.top_ten(mdf, 'weight'), precision=3, column_options=col_opt, column_definitions=col_def)
out = widgets.Output()
ind_out = widgets.Output()
def handle_selection_change(_event, _widget):
sdf = g.get_selected_df()
out.clear_output()
ind_out.clear_output()
if sdf.shape[0] >= 1:
subjid = sdf.subjid.iloc[0]
with out:
with ind_out:
charts.overlap_view_adults(obs, subjid, 'WEIGHTKG', True, True, wt_percentiles, bmi_percentiles, ht_percentiles)
display(plt.show())
g.on('selection_changed', handle_selection_change)
widgets.VBox([g, out])
widgets.VBox([g, ind_out])


# It can be useful to copy values from the `subjid` column in the results above for use in visualizations in the rest of the tool.
Expand All @@ -290,8 +290,8 @@ def handle_selection_change(_event, _widget):


all_ids = obs['subjid'].unique()
val = 'd88d3987-93ff-0820-286f-754cd971012d' if 'd88d3987-93ff-0820-286f-754cd971012d' in all_ids else np.random.choice(all_ids, size=1, replace=False) # another good id: 25477664
interactive(charts.overlap_view_adults, obs_df=fixed(obs_wbmi),
val = 2868 if 2868 in all_ids else np.random.choice(all_ids, size=1, replace=False)
interactive(charts.overlap_view_adults_show, obs_df=fixed(obs_wbmi),
subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False),
param=['HEIGHTCM', 'WEIGHTKG', 'BMI'],
include_carry_forward=widgets.Checkbox(value=True,description='Include Carry Forward',disabled=False,indent=False),
Expand All @@ -302,21 +302,23 @@ def handle_selection_change(_event, _widget):
# In[22]:


obs_wbmi[obs_wbmi['subjid'] == 'd88d3987-93ff-0820-286f-754cd971012d'] # b5a84a9d-dd7c-95cb-5fd9-3c581a72c812, 867a461b-7cb8-76aa-9891-42369a9899e8 is an example with the underweight line
obs_wbmi[obs_wbmi['subjid'] == 2868]


# In[23]:


# display all charts at the same time
charts.overlap_view_adults(obs_df=obs_wbmi, subjid=val, param='WEIGHTKG', include_carry_forward=True,
@interact(subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False))
def all_charts(subjid=val):
charts.overlap_view_adults(obs_df=obs_wbmi, subjid=subjid, param='WEIGHTKG', include_carry_forward=True,
include_percentiles=True, wt_df=wt_percentiles, bmi_df=bmi_percentiles, ht_df=ht_percentiles)

charts.overlap_view_adults(obs_df=obs_wbmi, subjid=val, param='BMI', include_carry_forward=True,
charts.overlap_view_adults(obs_df=obs_wbmi, subjid=subjid, param='BMI', include_carry_forward=True,
include_percentiles=True, wt_df=wt_percentiles, bmi_df=bmi_percentiles, ht_df=ht_percentiles)

charts.overlap_view_adults(obs_df=obs_wbmi, subjid=val, param='HEIGHTCM', include_carry_forward=True,
charts.overlap_view_adults(obs_df=obs_wbmi, subjid=subjid, param='HEIGHTCM', include_carry_forward=True,
include_percentiles=True, wt_df=wt_percentiles, bmi_df=bmi_percentiles, ht_df=ht_percentiles)
plt.show()


# # Visualizing Multiple Trajectories at Once
Expand Down Expand Up @@ -357,12 +359,12 @@ def handle_selection_change(_event, _widget):
#
# This tool can be used to create samples that are tailored to specific interests. Views can easily be created on existing DataFrames and be used to generate different samples. Functionality available is described in the [Pandas DataFrame documentation](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html).
#
# The cell below selects all observations with a weight exclusion of "Exclude-Moderate-EWMA". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer examples than 25, no chart is generated.
# The cell below selects all observations with a weight exclusion of "Exclude-Adult-EWMA-Moderate". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer than 25 examples, but at least one, each example is shown.

# In[28]:


top_weight_moderate_ewma_ids = merged_df[merged_df.weight_cat == 'Exclude-Moderate-EWMA'].sort_values('weight', ascending=False).head(50)['subjid'].unique()
top_weight_moderate_ewma_ids = merged_df[merged_df.weight_cat == 'Exclude-Adult-EWMA-Moderate'].sort_values('weight', ascending=False).head(50)['subjid'].unique()
if len(top_weight_moderate_ewma_ids) >= 25:
ewma_sample = np.random.choice(top_weight_moderate_ewma_ids, size=25, replace=False)
charts.five_by_five_view(obs, ewma_sample, 'WEIGHTKG', wt_percentiles, ht_percentiles, bmi_percentiles, 'dotted')
Expand All @@ -384,7 +386,8 @@ def edge25(obs, category, group, sort_order, param):
else:
filtered_sum = filtered_sum.nsmallest(25, 'min_measure')
filtered_sum.sort_values(by=[sort_order, 'subjid'], inplace=True)
return charts.five_by_five_view(obs, filtered_sum.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'dotted')
fig = charts.five_by_five_view(obs, filtered_sum.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'dotted')
plt.show()

interact(edge25, obs = fixed(obs_wbmi_mult), category = obs.clean_cat.unique(),
group = ['largest', 'smallest'], sort_order = ['max_measure', 'min_measure', 'start_age', 'axis_range'], param = ['WEIGHTKG', 'HEIGHTCM', 'BMI'])
Expand All @@ -398,7 +401,7 @@ def edge25(obs, category, group, sort_order, param):


all_ids = obs_wbmi['subjid'].unique()
val = 143216308 if 143216308 in all_ids else np.random.choice(all_ids, size=1, replace=False)
val = 2431 if 2431 in all_ids else np.random.choice(all_ids, size=1, replace=False)
interact(charts.param_with_percentiles, merged_df = fixed(obs_wbmi),
subjid = widgets.Dropdown(options=all_ids, value=val,
description='Subject ID:',disabled=False),
Expand Down Expand Up @@ -446,12 +449,12 @@ def edge25(obs, category, group, sort_order, param):
include_missing_values = widgets.Checkbox(value=False,description='Include Missing / Zero Heights and Weights',disabled=False,indent=False)
hbox = widgets.HBox([min_toggle, mean_toggle, max_toggle, std_toggle, count_toggle, diff_toggle])
ui = widgets.VBox([age_range, hbox, include_missing_values])
out = widgets.Output()
sum_out = widgets.Output()
widgets.interactive_output(sumstats.bmi_stats, {'merged_df': fixed(merged_df), 'include_min': min_toggle,
'include_mean': mean_toggle, 'include_max': max_toggle, 'include_std': std_toggle,
'include_mean_diff': diff_toggle, 'include_count': count_toggle,
'out': fixed(out), 'age_range': age_range, 'include_missing': include_missing_values})
display(ui, out)
'out': fixed(sum_out), 'age_range': age_range, 'include_missing': include_missing_values})
display(ui, sum_out)


# # Exporting Data
Expand Down Expand Up @@ -480,22 +483,20 @@ def edge25(obs, category, group, sort_order, param):
df_selector = widgets.Dropdown(options=processdata.data_frame_names(locals()), description='Data Frames')
generate_button = widgets.Button(description='Generate CSV')
ui = widgets.VBox([df_selector, generate_button])
out = widgets.Output()
csv_out = widgets.Output()

l = locals()
def on_button_clicked(b):
processdata.export_to_csv(l, df_selector, out)
processdata.export_to_csv(l, df_selector, csv_out)

generate_button.on_click(on_button_clicked)

display(ui, out)
display(ui, csv_out)


# # Post Processing Data
#
# This tool provides functions that allow the post processing of data. `processdata.clean_swapped_values` will look in a DataFrame for rows where the `height_cat` and `weight_cat` are set to "Exclude-Swaps-RV". It will then swap the `height` and `weight` values for those rows. It will also create two new columns: `postprocess_height_cat` and `postprocess_weight_cat`. The values for these columns is copied from the original categories except in the case where swaps are fixed when it is set to "Include-Fixed-Swap".
#
# `processdata.clean_unit_errors` will look in a data frame for rows where the `height_cat` and `weight_cat` are set to "Exclude-Unit-Errors". It will divide or multiply the value to convert it to metric.
# This tool provides functions that allow the post-processing of data. `processdata.clean_swapped_values` will look in a DataFrame for rows where the `height_cat` and `weight_cat` are both flagged for exclusion with "`Exclude-Adult-Swapped-Measurements`". It will then swap the `height` and `weight` values for those rows, and recalculate BMI. It will also create two new columns: `postprocess_height_cat` and `postprocess_weight_cat`. The values for these columns are copied from the original categories, except for rows where a swap was fixed, which are set to "`Include-Fixed-Swap`".
#
# The cell below copies the merged DataFrame and then cleans the swapped values.

Expand All @@ -504,27 +505,17 @@ def on_button_clicked(b):

cleaned = merged_df.copy()
cleaned = processdata.clean_swapped_values(cleaned)
cleaned[cleaned.weight_cat == 'Exclude-Swaps-RV'].head()


# The cell below copies the merged DataFrame and then cleans the unit errors. Note: To see results in the table below with the example data you may need to swap "clean_with_swaps.csv" for "clean_with_uswaps.csv" and rerun the cells in the "Loading Data" section above. The default example set has swaps but not unit errors.

# In[34]:


cleaned = merged_df.copy()
cleaned = processdata.clean_unit_errors(cleaned)
cleaned[cleaned.height_cat == 'Exclude-Unit-Errors'].head()
cleaned[cleaned.weight_cat == 'Exclude-Adult-Swapped-Measurements'].head()


# # Developing New Visualizations
#
# Users may take advantage of the predefined `sumstats.bmi_stats`, `charts.bmi_with_percentiles`, `charts.five_by_five_view`, `charts.overlap_view_adults` and `charts.top_ten` functions. For more information on these functions, execute the function name ending with a "?", which will bring up the inline help window. For example, `charts.five_by_five_view?`.

# In[35]:
# In[34]:


get_ipython().run_line_magic('pinfo', 'sumstats.bmi_stats')


# In addition, users can take advantage of all of the plotting capabilities of [Matplotlib](https://matplotlib.org/3.1.1/tutorials/index.html) and [Seaborn](https://seaborn.pydata.org/tutorial.html)
# In addition, users can take advantage of all of the plotting capabilities of [Matplotlib](https://matplotlib.org/stable/tutorials/index.html) and [Seaborn](https://seaborn.pydata.org/tutorial.html).
1,096 changes: 494 additions & 602 deletions GrowthViz-pediatrics.ipynb

Large diffs are not rendered by default.

44 changes: 25 additions & 19 deletions GrowthViz-pediatrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,17 +224,17 @@
'BMIz': { 'width': 30 },
}
g = qgrid.show_grid(charts.top_ten(mdf, 'weight'), precision=3, column_options=col_opt, column_definitions=col_def)
out = widgets.Output()
ind_out = widgets.Output()
def handle_selection_change(_event, _widget):
sdf = g.get_selected_df()
out.clear_output()
ind_out.clear_output()
if sdf.shape[0] >= 1:
subjid = sdf.subjid.iloc[0]
with out:
with ind_out:
charts.overlap_view_pediatrics(obs, subjid, 'WEIGHTKG', True, True, wt_percentiles, ht_percentiles)
display(plt.show())
g.on('selection_changed', handle_selection_change)
widgets.VBox([g, out])
widgets.VBox([g, ind_out])


# It can be useful to copy values from the `subjid` column in the results above for use in visualizations in the rest of the tool.
Expand All @@ -251,13 +251,17 @@ def handle_selection_change(_event, _widget):


all_ids = obs['subjid'].unique()
val = 47085108 if 47085108 in all_ids else np.random.choice(all_ids, size=1, replace=False)
interactive(charts.overlap_view_pediatrics, obs_df=fixed(obs),
val = 5450 if 5450 in all_ids else np.random.choice(all_ids, size=1, replace=False)
interactive(charts.overlap_view_pediatrics_show,
obs_df=fixed(obs),
subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False),
param=['HEIGHTCM', 'WEIGHTKG'],
include_carry_forward=widgets.Checkbox(value=True,description='Include Carry Forward',disabled=False,indent=False),
include_percentiles=widgets.Checkbox(value=True,description='Include Measurement Percentile Bands',disabled=False,indent=False),
wt_df=fixed(wt_percentiles), ht_df=fixed(ht_percentiles), bmi_df=fixed(bmi_percentiles))
wt_df=fixed(wt_percentiles),
ht_df=fixed(ht_percentiles),
bmi_df=fixed(bmi_percentiles)
)


# The cell below also creates a plot for an individual modeled after the [CDC paper growth charts](https://www.cdc.gov/growthcharts/data/set1clinical/cj41c021.pdf). It shows both the weight trajectory and height trajectory. The lighter bands in the diagram background represent the 5th through 95th percentile values for age and sex for the given measurement type.
Expand All @@ -268,8 +272,9 @@ def handle_selection_change(_event, _widget):


all_ids = obs['subjid'].unique()
val = 47085108 if 47085108 in all_ids else np.random.choice(all_ids, size=1, replace=False)
interactive(charts.overlap_view_double_pediatrics, obs_df=fixed(obs),
val = 5446 if 5446 in all_ids else np.random.choice(all_ids, size=1, replace=False)
interactive(charts.overlap_view_double_pediatrics,
obs_df=fixed(obs),
subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False),
show_all_measurements=widgets.Checkbox(value=True,description='Show All Measurements',disabled=False,indent=False),
show_excluded_values=widgets.Checkbox(value=True,description='Show Excluded Values (x)',disabled=False,indent=False),
Expand Down Expand Up @@ -316,14 +321,14 @@ def handle_selection_change(_event, _widget):
#
# This tool can be used to create samples that are tailored to specific interests. Views can easily be created on existing DataFrames and be used to generate different samples. Functionality available is described in the [Pandas DataFrame documentation](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html).
#
# The cell below selects all observations with a weight exclusion of "Exclude-EWMA-Extreme". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer examples than 25, no chart is generated.
# The cell below selects all observations with a weight exclusion of "Exclude-EWMA-Extreme". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer than 25 examples, but at least one, each example is shown.

# In[24]:


top_weight_extreme_ewma_ids = merged_df[merged_df.weight_cat == 'Exclude-EWMA-Extreme'].sort_values('weight', ascending=False).head(50)['subjid'].unique()
if len(top_weight_extreme_ewma_ids) >= 25:
ewma_sample = np.random.choice(top_weight_extreme_ewma_ids, size=25, replace=False)
if len(top_weight_extreme_ewma_ids) >= 1:
ewma_sample = np.random.choice(top_weight_extreme_ewma_ids, size=len(top_weight_extreme_ewma_ids), replace=False)
charts.five_by_five_view(obs, ewma_sample, 'WEIGHTKG', wt_percentiles, ht_percentiles, bmi_percentiles, 'solid')


Expand All @@ -340,7 +345,8 @@ def edge25(obs, category, sort_order, param):
filtered_by_cat = filtered_by_cat.nlargest(25, 'measurement')
else:
filtered_by_cat = filtered_by_cat.nsmallest(25, 'measurement')
return charts.five_by_five_view(obs, filtered_by_cat.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'solid')
fig = charts.five_by_five_view(obs, filtered_by_cat.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'solid')
plt.show()

interact(edge25, obs = fixed(obs), category = obs.clean_cat.unique(),
sort_order = ['largest', 'smallest'], param = ['WEIGHTKG', 'HEIGHTCM'])
Expand Down Expand Up @@ -406,12 +412,12 @@ def edge25(obs, category, sort_order, param):
include_missing_values = widgets.Checkbox(value=False,description='Include Missing / Zero Heights and Weights',disabled=False,indent=False)
hbox = widgets.HBox([min_toggle, mean_toggle, max_toggle, std_toggle, count_toggle, diff_toggle])
ui = widgets.VBox([age_range, hbox, include_missing_values])
out = widgets.Output()
sum_out = widgets.Output()
widgets.interactive_output(sumstats.bmi_stats, {'merged_df': fixed(merged_df), 'include_min': min_toggle,
'include_mean': mean_toggle, 'include_max': max_toggle, 'include_std': std_toggle,
'include_mean_diff': diff_toggle, 'include_count': count_toggle,
'out': fixed(out), 'age_range': age_range, 'include_missing': include_missing_values})
display(ui, out)
'out': fixed(sum_out), 'age_range': age_range, 'include_missing': include_missing_values})
display(ui, sum_out)


# # Exporting Data
Expand All @@ -435,15 +441,15 @@ def edge25(obs, category, sort_order, param):
df_selector = widgets.Dropdown(options=processdata.data_frame_names(locals()), description='Data Frames')
generate_button = widgets.Button(description='Generate CSV')
ui = widgets.VBox([df_selector, generate_button])
out = widgets.Output()
csv_out = widgets.Output()

l = locals()
def on_button_clicked(b):
processdata.export_to_csv(l, df_selector, out)
processdata.export_to_csv(l, df_selector, csv_out)

generate_button.on_click(on_button_clicked)

display(ui, out)
display(ui, csv_out)


# # Post Processing Data
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,10 @@ If not using Anaconda, specific versions of packages can be found in `requiremen

By default, when you reach Step 6 of the [Simple Install](#simple-install)
instructions above, the notebook will use sample data loaded from the `.csv`
files located in the GrowthViz-master project.
files located in the GrowthViz project. This is the same synthetic sample data
that is packaged with
[growthcleanr](https://github.com/carriedaymont/growthcleanr), cleaned and
then separated into pediatric and adult sets for GrowthViz.

To ensure that all of the necessary example files are present, run the
`check_setup.py` script.
Expand Down
Binary file modified README.pdf
Binary file not shown.
Loading

0 comments on commit 90f4681

Please sign in to comment.