Merge pull request #50 from mitre/t37-update-sample-data

Update sample data, closes #37, #51, #52
mitre · Jul 2, 2021 · 90f4681 · 90f4681
2 parents 481187d + dba5b93
commit 90f4681
Show file tree

Hide file tree

Showing 13 changed files with 81,300 additions and 87,825 deletions.
diff --git a/.gitignore b/.gitignore
@@ -145,3 +145,4 @@ cython_debug/
 *.swo
 *.swp
 .DS_Store
+output
diff --git a/GrowthViz-adults.ipynb b/GrowthViz-adults.ipynb
diff --git a/GrowthViz-adults.py b/GrowthViz-adults.py
@@ -263,17 +263,17 @@
     'BMIz': { 'width': 30 },
 }
 g = qgrid.show_grid(charts.top_ten(mdf, 'weight'), precision=3, column_options=col_opt, column_definitions=col_def)
-out = widgets.Output()
+ind_out = widgets.Output()
 def handle_selection_change(_event, _widget):
     sdf = g.get_selected_df()
-    out.clear_output()
+    ind_out.clear_output()
     if sdf.shape[0] >= 1:
         subjid = sdf.subjid.iloc[0]
-        with out:
+        with ind_out:
             charts.overlap_view_adults(obs, subjid, 'WEIGHTKG', True, True, wt_percentiles, bmi_percentiles, ht_percentiles)
             display(plt.show())
 g.on('selection_changed', handle_selection_change)    
-widgets.VBox([g, out])
+widgets.VBox([g, ind_out])
 
 
 # It can be useful to copy values from the `subjid` column in the results above for use in visualizations in the rest of the tool.
@@ -290,8 +290,8 @@ def handle_selection_change(_event, _widget):
 
 
 all_ids = obs['subjid'].unique()
-val = 'd88d3987-93ff-0820-286f-754cd971012d' if 'd88d3987-93ff-0820-286f-754cd971012d' in all_ids else np.random.choice(all_ids, size=1, replace=False) # another good id: 25477664
-interactive(charts.overlap_view_adults, obs_df=fixed(obs_wbmi), 
+val = 2868 if 2868 in all_ids else np.random.choice(all_ids, size=1, replace=False)
+interactive(charts.overlap_view_adults_show, obs_df=fixed(obs_wbmi), 
             subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False), 
             param=['HEIGHTCM', 'WEIGHTKG', 'BMI'], 
             include_carry_forward=widgets.Checkbox(value=True,description='Include Carry Forward',disabled=False,indent=False), 
@@ -302,21 +302,23 @@ def handle_selection_change(_event, _widget):
 # In[22]:
 
 
-obs_wbmi[obs_wbmi['subjid'] == 'd88d3987-93ff-0820-286f-754cd971012d'] # b5a84a9d-dd7c-95cb-5fd9-3c581a72c812, 867a461b-7cb8-76aa-9891-42369a9899e8 is an example with the underweight line
+obs_wbmi[obs_wbmi['subjid'] == 2868]
 
 
 # In[23]:
 
 
-# display all charts at the same time
-charts.overlap_view_adults(obs_df=obs_wbmi, subjid=val, param='WEIGHTKG', include_carry_forward=True, 
+@interact(subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False))
+def all_charts(subjid=val):
+    charts.overlap_view_adults(obs_df=obs_wbmi, subjid=subjid, param='WEIGHTKG', include_carry_forward=True, 
             include_percentiles=True, wt_df=wt_percentiles, bmi_df=bmi_percentiles, ht_df=ht_percentiles)
 
-charts.overlap_view_adults(obs_df=obs_wbmi, subjid=val, param='BMI', include_carry_forward=True, 
+    charts.overlap_view_adults(obs_df=obs_wbmi, subjid=subjid, param='BMI', include_carry_forward=True, 
             include_percentiles=True, wt_df=wt_percentiles, bmi_df=bmi_percentiles, ht_df=ht_percentiles)
 
-charts.overlap_view_adults(obs_df=obs_wbmi, subjid=val, param='HEIGHTCM', include_carry_forward=True, 
+    charts.overlap_view_adults(obs_df=obs_wbmi, subjid=subjid, param='HEIGHTCM', include_carry_forward=True, 
             include_percentiles=True, wt_df=wt_percentiles, bmi_df=bmi_percentiles, ht_df=ht_percentiles)
+    plt.show()
 
 
 # # Visualizing Multiple Trajectories at Once
@@ -357,12 +359,12 @@ def handle_selection_change(_event, _widget):
 # 
 # This tool can be used to create samples that are tailored to specific interests. Views can easily be created on existing DataFrames and be used to generate different samples. Functionality available is described in the [Pandas DataFrame documentation](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html).
 # 
-# The cell below selects all observations with a weight exclusion of "Exclude-Moderate-EWMA". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer examples than 25, no chart is generated. 
+# The cell below selects all observations with a weight exclusion of "Exclude-Adult-EWMA-Moderate". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer than 25 unique `subjids`, no chart is generated.
 
 # In[28]:
 
 
-top_weight_moderate_ewma_ids = merged_df[merged_df.weight_cat == 'Exclude-Moderate-EWMA'].sort_values('weight', ascending=False).head(50)['subjid'].unique()
+top_weight_moderate_ewma_ids = merged_df[merged_df.weight_cat == 'Exclude-Adult-EWMA-Moderate'].sort_values('weight', ascending=False).head(50)['subjid'].unique()
 if len(top_weight_moderate_ewma_ids) >= 25:
     ewma_sample = np.random.choice(top_weight_moderate_ewma_ids, size=25, replace=False)
     charts.five_by_five_view(obs, ewma_sample, 'WEIGHTKG', wt_percentiles, ht_percentiles, bmi_percentiles, 'dotted')
@@ -384,7 +386,8 @@ def edge25(obs, category, group, sort_order, param):
     else:
         filtered_sum = filtered_sum.nsmallest(25, 'min_measure')
     filtered_sum.sort_values(by=[sort_order, 'subjid'], inplace=True)
-    return charts.five_by_five_view(obs, filtered_sum.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'dotted')
+    fig = charts.five_by_five_view(obs, filtered_sum.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'dotted')
+    plt.show()
 
 interact(edge25, obs = fixed(obs_wbmi_mult), category = obs.clean_cat.unique(), 
          group = ['largest', 'smallest'], sort_order = ['max_measure', 'min_measure', 'start_age', 'axis_range'], param = ['WEIGHTKG', 'HEIGHTCM', 'BMI'])
@@ -398,7 +401,7 @@ def edge25(obs, category, group, sort_order, param):
 
 
 all_ids = obs_wbmi['subjid'].unique()
-val = 143216308 if 143216308 in all_ids else np.random.choice(all_ids, size=1, replace=False)
+val = 2431 if 2431 in all_ids else np.random.choice(all_ids, size=1, replace=False)
 interact(charts.param_with_percentiles, merged_df = fixed(obs_wbmi),
          subjid = widgets.Dropdown(options=all_ids, value=val,
                                          description='Subject ID:',disabled=False), 
@@ -446,12 +449,12 @@ def edge25(obs, category, group, sort_order, param):
 include_missing_values = widgets.Checkbox(value=False,description='Include Missing / Zero Heights and Weights',disabled=False,indent=False)
 hbox = widgets.HBox([min_toggle, mean_toggle, max_toggle, std_toggle, count_toggle, diff_toggle])
 ui = widgets.VBox([age_range, hbox, include_missing_values])
-out = widgets.Output()
+sum_out = widgets.Output()
 widgets.interactive_output(sumstats.bmi_stats, {'merged_df': fixed(merged_df), 'include_min': min_toggle, 
          'include_mean': mean_toggle, 'include_max': max_toggle, 'include_std': std_toggle, 
          'include_mean_diff': diff_toggle, 'include_count': count_toggle,
-         'out': fixed(out), 'age_range': age_range, 'include_missing': include_missing_values})
-display(ui, out)
+         'out': fixed(sum_out), 'age_range': age_range, 'include_missing': include_missing_values})
+display(ui, sum_out)
 
 
 # # Exporting Data
@@ -480,22 +483,20 @@ def edge25(obs, category, group, sort_order, param):
 df_selector = widgets.Dropdown(options=processdata.data_frame_names(locals()), description='Data Frames')
 generate_button = widgets.Button(description='Generate CSV')
 ui = widgets.VBox([df_selector, generate_button])
-out = widgets.Output()
+csv_out = widgets.Output()
 
 l = locals()
 def on_button_clicked(b):
-    processdata.export_to_csv(l, df_selector, out)
+    processdata.export_to_csv(l, df_selector, csv_out)
 
 generate_button.on_click(on_button_clicked)
 
-display(ui, out)
+display(ui, csv_out)
 
 
 # # Post Processing Data
 # 
-# This tool provides functions that allow the post processing of data. `processdata.clean_swapped_values` will look in a DataFrame for rows where the `height_cat` and `weight_cat` are set to "Exclude-Swaps-RV". It will then swap the `height` and `weight` values for those rows. It will also create two new columns: `postprocess_height_cat` and `postprocess_weight_cat`. The values for these columns is copied from the original categories except in the case where swaps are fixed when it is set to "Include-Fixed-Swap".
-# 
-# `processdata.clean_unit_errors` will look in a data frame for rows where the `height_cat` and `weight_cat` are set to "Exclude-Unit-Errors". It will divide or multiply the value to convert it to metric.
+# This tool provides functions that allow the post-processing of data. `processdata.clean_swapped_values` will look in a DataFrame for rows where the `height_cat` and `weight_cat` are both flagged for exclusion with "`Exclude-Adult-Swapped-Measurements`". It will then swap the `height` and `weight` values for those rows, and recalculate BMI. It will also create two new columns: `postprocess_height_cat` and `postprocess_weight_cat`. The values for these columns are copied from the original categories, except for rows where a swap was fixed, where they are set to "`Include-Fixed-Swap`".
 # 
 # The cell below copies the merged DataFrame and then cleans the swapped values.
 
@@ -504,27 +505,17 @@ def on_button_clicked(b):
 
 cleaned = merged_df.copy()
 cleaned = processdata.clean_swapped_values(cleaned)
-cleaned[cleaned.weight_cat == 'Exclude-Swaps-RV'].head()
-
-
-# The cell below copies the merged DataFrame and then cleans the unit errors. Note: To see results in the table below with the example data you may need to swap "clean_with_swaps.csv" for "clean_with_uswaps.csv" and rerun the cells in the "Loading Data" section above. The default example set has swaps but not unit errors.
-
-# In[34]:
-
-
-cleaned = merged_df.copy()
-cleaned = processdata.clean_unit_errors(cleaned)
-cleaned[cleaned.height_cat == 'Exclude-Unit-Errors'].head()
+cleaned[cleaned.weight_cat == 'Exclude-Adult-Swapped-Measurements'].head()
 
 
 # # Developing New Visualizations
 # 
 # Users may take advantage of the predefined `sumstats.bmi_stats`, `charts.bmi_with_percentiles`, `charts.five_by_five_view`, `charts.overlap_view_adults` and `charts.top_ten` functions. For more information on these functions, execute the function name followed by a "?", which will bring up the inline help window. For example, `charts.five_by_five_view?`. The cell below does this for `sumstats.bmi_stats`.
 
-# In[35]:
+# In[34]:
 
 
 get_ipython().run_line_magic('pinfo', 'sumstats.bmi_stats')
 
 
-# In addition, users can take advantage of all of the plotting capabilities of [Matplotlib](https://matplotlib.org/3.1.1/tutorials/index.html) and [Seaborn](https://seaborn.pydata.org/tutorial.html)
+# In addition, users can take advantage of all of the plotting capabilities of [Matplotlib](https://matplotlib.org/stable/tutorials/index.html) and [Seaborn](https://seaborn.pydata.org/tutorial.html).
diff --git a/GrowthViz-pediatrics.ipynb b/GrowthViz-pediatrics.ipynb
diff --git a/GrowthViz-pediatrics.py b/GrowthViz-pediatrics.py
@@ -224,17 +224,17 @@
     'BMIz': { 'width': 30 },
 }
 g = qgrid.show_grid(charts.top_ten(mdf, 'weight'), precision=3, column_options=col_opt, column_definitions=col_def)
-out = widgets.Output()
+ind_out = widgets.Output()
 def handle_selection_change(_event, _widget):
     sdf = g.get_selected_df()
-    out.clear_output()
+    ind_out.clear_output()
     if sdf.shape[0] >= 1:
         subjid = sdf.subjid.iloc[0]
-        with out:
+        with ind_out:
             charts.overlap_view_pediatrics(obs, subjid, 'WEIGHTKG', True, True, wt_percentiles, ht_percentiles)
             display(plt.show())
 g.on('selection_changed', handle_selection_change)    
-widgets.VBox([g, out])
+widgets.VBox([g, ind_out])
 
 
 # It can be useful to copy values from the `subjid` column in the results above for use in visualizations in the rest of the tool.
@@ -251,13 +251,17 @@ def handle_selection_change(_event, _widget):
 
 
 all_ids = obs['subjid'].unique()
-val = 47085108 if 47085108 in all_ids else np.random.choice(all_ids, size=1, replace=False)
-interactive(charts.overlap_view_pediatrics, obs_df=fixed(obs), 
+val = 5450 if 5450 in all_ids else np.random.choice(all_ids, size=1, replace=False)
+interactive(charts.overlap_view_pediatrics_show, 
+            obs_df=fixed(obs), 
             subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False), 
             param=['HEIGHTCM', 'WEIGHTKG'], 
             include_carry_forward=widgets.Checkbox(value=True,description='Include Carry Forward',disabled=False,indent=False), 
             include_percentiles=widgets.Checkbox(value=True,description='Include Measurement Percentile Bands',disabled=False,indent=False),
-            wt_df=fixed(wt_percentiles), ht_df=fixed(ht_percentiles), bmi_df=fixed(bmi_percentiles))
+            wt_df=fixed(wt_percentiles), 
+            ht_df=fixed(ht_percentiles), 
+            bmi_df=fixed(bmi_percentiles)
+           )
 
 
 # The cell below also creates a plot for an individual modeled after the [CDC paper growth charts](https://www.cdc.gov/growthcharts/data/set1clinical/cj41c021.pdf). It shows both the weight trajectory and height trajectory. The lighter bands in the diagram background represent the 5th through 95th percentile values for age and sex for the given measurement type.
@@ -268,8 +272,9 @@ def handle_selection_change(_event, _widget):
 
 
 all_ids = obs['subjid'].unique()
-val = 47085108 if 47085108 in all_ids else np.random.choice(all_ids, size=1, replace=False)
-interactive(charts.overlap_view_double_pediatrics, obs_df=fixed(obs), 
+val = 5446 if 5446 in all_ids else np.random.choice(all_ids, size=1, replace=False)
+interactive(charts.overlap_view_double_pediatrics, 
+            obs_df=fixed(obs), 
             subjid=widgets.Dropdown(options=all_ids, value=val, description='Subject ID:', disabled=False),
             show_all_measurements=widgets.Checkbox(value=True,description='Show All Measurements',disabled=False,indent=False),
             show_excluded_values=widgets.Checkbox(value=True,description='Show Excluded Values (x)',disabled=False,indent=False),
@@ -316,14 +321,14 @@ def handle_selection_change(_event, _widget):
 # 
 # This tool can be used to create samples that are tailored to specific interests. Views can easily be created on existing DataFrames and be used to generate different samples. Functionality available is described in the [Pandas DataFrame documentation](https://pandas.pydata.org/pandas-docs/stable/reference/frame.html).
 # 
-# The cell below selects all observations with a weight exclusion of "Exclude-EWMA-Extreme". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer examples than 25, no chart is generated. 
+# The cell below selects all observations with a weight exclusion of "Exclude-EWMA-Extreme". It then sorts by weight in descending order. The code then takes the top 50 values and selects 25 random, unique `subjids` from that set. Finally it plots the results. If there are fewer than 25 examples, but at least one, each example is shown. 
 
 # In[24]:
 
 
 top_weight_extreme_ewma_ids = merged_df[merged_df.weight_cat == 'Exclude-EWMA-Extreme'].sort_values('weight', ascending=False).head(50)['subjid'].unique()
-if len(top_weight_extreme_ewma_ids) >= 25:
-    ewma_sample = np.random.choice(top_weight_extreme_ewma_ids, size=25, replace=False)
+if len(top_weight_extreme_ewma_ids) >= 1:
+    ewma_sample = np.random.choice(top_weight_extreme_ewma_ids, size=len(top_weight_extreme_ewma_ids), replace=False)
     charts.five_by_five_view(obs, ewma_sample, 'WEIGHTKG', wt_percentiles, ht_percentiles, bmi_percentiles, 'solid')
 
 
@@ -340,7 +345,8 @@ def edge25(obs, category, sort_order, param):
         filtered_by_cat = filtered_by_cat.nlargest(25, 'measurement')
     else:
         filtered_by_cat = filtered_by_cat.nsmallest(25, 'measurement')
-    return charts.five_by_five_view(obs, filtered_by_cat.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'solid')
+    fig = charts.five_by_five_view(obs, filtered_by_cat.subjid.values, param, wt_percentiles, ht_percentiles, bmi_percentiles, 'solid')
+    plt.show()
 
 interact(edge25, obs = fixed(obs), category = obs.clean_cat.unique(), 
          sort_order = ['largest', 'smallest'], param = ['WEIGHTKG', 'HEIGHTCM'])
@@ -406,12 +412,12 @@ def edge25(obs, category, sort_order, param):
 include_missing_values = widgets.Checkbox(value=False,description='Include Missing / Zero Heights and Weights',disabled=False,indent=False)
 hbox = widgets.HBox([min_toggle, mean_toggle, max_toggle, std_toggle, count_toggle, diff_toggle])
 ui = widgets.VBox([age_range, hbox, include_missing_values])
-out = widgets.Output()
+sum_out = widgets.Output()
 widgets.interactive_output(sumstats.bmi_stats, {'merged_df': fixed(merged_df), 'include_min': min_toggle, 
          'include_mean': mean_toggle, 'include_max': max_toggle, 'include_std': std_toggle, 
          'include_mean_diff': diff_toggle, 'include_count': count_toggle,
-         'out': fixed(out), 'age_range': age_range, 'include_missing': include_missing_values})
-display(ui, out)
+         'out': fixed(sum_out), 'age_range': age_range, 'include_missing': include_missing_values})
+display(ui, sum_out)
 
 
 # # Exporting Data
@@ -435,15 +441,15 @@ def edge25(obs, category, sort_order, param):
 df_selector = widgets.Dropdown(options=processdata.data_frame_names(locals()), description='Data Frames')
 generate_button = widgets.Button(description='Generate CSV')
 ui = widgets.VBox([df_selector, generate_button])
-out = widgets.Output()
+csv_out = widgets.Output()
 
 l = locals()
 def on_button_clicked(b):
-    processdata.export_to_csv(l, df_selector, out)
+    processdata.export_to_csv(l, df_selector, csv_out)
 
 generate_button.on_click(on_button_clicked)
 
-display(ui, out)
+display(ui, csv_out)
 
 
 # # Post Processing Data

diff --git a/README.md b/README.md
@@ -151,7 +151,10 @@ If not using Anaconda, specific versions of packages can be found in `requiremen
 
 By default when you reach Step 6 of the [Simple Install](#simple-install)
 instructions above the notebook will use sample data loaded from the `.csv`
-files located in the GrowthViz-master project.
+files located in the GrowthViz project. This is the same synthetic sample data
+that is packaged with
+[growthcleanr](https://github.com/carriedaymont/growthcleanr), cleaned and
+then separated into pediatric and adult sets for GrowthViz.
 
 To ensure that all of the necessary example files are present, run the
 `check_setup.py` script.

diff --git a/README.pdf b/README.pdf
-Original file line number
+Diff line change
@@ Expand Up / @@ -145,3 +145,4 @@ cython_debug/ @@
     *.swo
     *.swp
     .DS_Store
+    output