Merge pull request #124 from HallLab/dev

Open Betas
HallLab · Nov 16, 2023 · ee1e1dc · ee1e1dc
2 parents a493334 + bd22eaf
commit ee1e1dc
Show file tree

Hide file tree

Showing 12 changed files with 90 additions and 117 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,7 +17,8 @@ jobs:
       R_LIBS_USER: ./r-libs
 
     steps:
-      - uses: actions/checkout@v1
+      # - uses: actions/checkout@v1
+      - uses: actions/checkout@v2
         with:
           fetch-depth: 1
 
@@ -39,11 +40,14 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: 3.7
+          # python-version: 3.7
+          python-version: '3.9'
 
       - name: Install Poetry
-        uses: snok/[email protected]
+        # uses: snok/[email protected]
+        uses: snok/install-poetry@v1
         with:
+          version: 1.5.1
           virtualenvs-create: true
           virtualenvs-in-project: true
 

diff --git a/clarite/internal/utilities.py b/clarite/internal/utilities.py
@@ -54,13 +54,13 @@ def _validate_skip_only(
 ):
     """Validate use of the 'skip' and 'only' parameters, returning a boolean series for the columns where True = use the column"""
     # Ensure that 'data' is a DataFrame and not a Series
-    if type(data) != pd.DataFrame:
+    if not isinstance(data, pd.DataFrame):
         raise ValueError("The passed 'data' is not a Pandas DataFrame")
 
     # Convert string to a list
-    if type(skip) == str:
+    if isinstance(skip, str):
         skip = [skip]
-    if type(only) == str:
+    if isinstance(only, str):
         only = [only]
 
     if skip is not None and only is not None:
@@ -204,7 +204,7 @@ def _remove_empty_categories(
     Updates the data in-place and returns a dict of variables:removed categories
     """
     removed_cats = dict()
-    if type(data) == pd.DataFrame:
+    if isinstance(data, pd.DataFrame):
         columns = _validate_skip_only(data, skip, only)
         dtypes = data.loc[:, columns].dtypes
         catvars = [v for v in dtypes[dtypes == "category"].index]
@@ -219,7 +219,7 @@ def _remove_empty_categories(
             if len(removed_categories) > 0:
                 removed_cats[var] = removed_categories
         return removed_cats
-    elif type(data) == pd.Series:
+    elif isinstance(data, pd.Series):
         assert skip is None
         assert only is None
         counts = data.value_counts()

diff --git a/clarite/modules/analyze/regression/base.py b/clarite/modules/analyze/regression/base.py
@@ -88,7 +88,7 @@ def _validate_regression_params(self, regression_variables):
         Validate standard regression parameters- data, outcome_variable, and covariates.  Store relevant information.
         """
         # Covariates must be a list
-        if type(self.covariates) != list:
+        if not isinstance(self.covariates, list):
             raise ValueError("'covariates' must be specified as a list or set to None")
 
         # Make sure the index of each dataset is not a multiindex and give it a consistent name

diff --git a/clarite/modules/analyze/regression/interaction_regression.py b/clarite/modules/analyze/regression/interaction_regression.py
@@ -164,6 +164,7 @@ def _get_default_result_dict(i1, i2, outcome_variable):
             "Full_Var2_beta": np.nan,
             "Full_Var2_SE": np.nan,
             "Full_Var2_Pval": np.nan,
+            "Log": "",
         }
 
     def get_results(self) -> pd.DataFrame:
@@ -232,10 +233,19 @@ def _run_interaction_regression(
             # in the result based on the specific requirements of the analysis
             if lrdf == 0 and lrstat == 0:
                 # Both models are equal
-                yield {"Converged": False, "LRT_pvalue": lr_pvalue}
-            if np.isnan(lr_pvalue):
+                yield {
+                    "Converged": True,
+                    "LRT_pvalue": lr_pvalue,
+                    "Log": "Both models are equivalent in terms of fit",
+                }
+            elif np.isnan(lr_pvalue):
                 # There is an issue with the LRT calculation
-                yield {"Converged": False, "LRT_pvalue": lr_pvalue}
+                # TODO: Extend the returned log messages
+                yield {
+                    "Converged": True,
+                    "LRT_pvalue": lr_pvalue,
+                    "Log": "Both models are equivalent in terms of fit",
+                }
             else:
                 if report_betas:
                     # Get beta, SE, and pvalue from interaction terms
@@ -278,14 +288,20 @@ def _run_interaction_regression(
                             "Full_Var2_SE": est.bse[term_2],
                             "Full_Var2_Pval": est.pvalues[term_2],
                             "LRT_pvalue": lr_pvalue,
+                            "Log": "",
                         }
                 else:
                     # Only return the LRT result
-                    yield {"Converged": True, "LRT_pvalue": lr_pvalue}
+                    yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": ""}
 
         else:
             # Did not converge - nothing to update
-            yield dict()
+            # yield dict()
+            yield {
+                "Converged": False,
+                "LRT_pvalue": "NaN",
+                "Log": "One or Both models NOT Converge",
+            }
 
     def _get_interaction_specific_data(self, interaction: Tuple[str, str]):
         """Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed"""
@@ -407,6 +423,8 @@ def _run_interaction(
             # Get complete case mask and filter by min_n
             complete_case_mask = ~data.isna().any(axis=1)
             N = complete_case_mask.sum()
+            if N == 0:
+                raise ValueError(f"No Overlap (min_n filter: {N} < {min_n})")
             if N < min_n:
                 raise ValueError(
                     f"too few complete observations (min_n filter: {N} < {min_n})"
@@ -476,5 +494,8 @@ def _run_interaction(
             error = str(e)
             if result is None:
                 result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]
+                result_list[0]["Log"] = error
+                result_list[0]["Converged"] = "NA"
+                result_list[0]["N"] = N
 
         return result_list, warnings_list, error
diff --git a/clarite/modules/analyze/utils.py b/clarite/modules/analyze/utils.py
@@ -44,10 +44,10 @@ def add_corrected_pvalues(
     if pvalue not in data.columns:
         raise ValueError(f"'{pvalue}' is not a column in the passed data")
     if groupby is not None:
-        if type(groupby) == str:
+        if isinstance(groupby, str):
             if (groupby not in data.columns) and (groupby not in data.index.names):
                 raise ValueError(f"'{groupby}' is not a column in the passed data")
-        elif type(groupby) == list:
+        elif isinstance(groupby, list):
             for g in groupby:
                 if (g not in data.columns) and (g not in data.index.names):
                     raise ValueError(f"'{g}' is not a column in the passed data")
@@ -96,13 +96,13 @@ def add_corrected_pvalues(
         # Expand results to duplicated rows
         data[bonf_name] = data[groupby].apply(
             lambda g: bonf_result.get(g, np.nan)
-            if type(g) == str
+            if isinstance(g, str)
             else bonf_result.get(tuple(g.values), np.nan),
             axis=1,
         )
         data[fdr_name] = data[groupby].apply(
             lambda g: bonf_result.get(g, np.nan)
-            if type(g) == str
+            if isinstance(g, str)
             else fdr_result.get(tuple(g.values), np.nan),
             axis=1,
         )

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "clarite"
-version = "2.3.5"
+version = "2.3.6"
 description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures"
 authors = ["Andre Rico <[email protected]>"]
 license = "BSD-3-Clause"

diff --git a/tests/analyze/test_gwas.py b/tests/analyze/test_gwas.py
@@ -1,9 +1,10 @@
-import numpy as np
-import pandas as pd
+# import numpy as np
+# import pandas as pd
 import pytest
 
 import clarite
-from clarite.modules.survey import SurveyDesignSpec
+
+# from clarite.modules.survey import SurveyDesignSpec
 
 
 def test_bams_main(genotype_case_control_add_add_main):
@@ -30,30 +31,30 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction):
 
 
 # @pytest.mark.slow
-@pytest.mark.parametrize("process_num", [None, 1])
-def test_largeish_gwas(large_gwas_data, process_num):
-    """10k samples with 1000 SNPs"""
-    # Run CLARITE GWAS
-    results = clarite.analyze.association_study(
-        data=large_gwas_data,
-        outcomes="Outcome",
-        encoding="additive",
-        process_num=process_num,
-    )
-    # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
-    results_weighted = clarite.analyze.association_study(
-        data=large_gwas_data,
-        outcomes="Outcome",
-        encoding="additive",
-        process_num=process_num,
-        survey_design_spec=SurveyDesignSpec(
-            survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
-            weights="weights",
-        ),
-    )
-    assert results == results
-    assert results_weighted == results_weighted
-    # TODO: Add useful asserts rather than just making sure it runs
+# @pytest.mark.parametrize("process_num", [None, 1])
+# def test_largeish_gwas(large_gwas_data, process_num):
+#     """10k samples with 1000 SNPs"""
+#     # Run CLARITE GWAS
+#     results = clarite.analyze.association_study(
+#         data=large_gwas_data,
+#         outcomes="Outcome",
+#         encoding="additive",
+#         process_num=process_num,
+#     )
+#     # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
+#     results_weighted = clarite.analyze.association_study(
+#         data=large_gwas_data,
+#         outcomes="Outcome",
+#         encoding="additive",
+#         process_num=process_num,
+#         survey_design_spec=SurveyDesignSpec(
+#             survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
+#             weights="weights",
+#         ),
+#     )
+#     assert results == results
+#     assert results_weighted == results_weighted
+#     # TODO: Add useful asserts rather than just making sure it runs
 
 
 @pytest.mark.xfail(strict=True)

diff --git a/tests/analyze/test_interaction_study.py b/tests/analyze/test_interaction_study.py
@@ -206,80 +206,26 @@ def test_interactions_nhanes_pairwise(data_NHANES):
     )
     compare_result(loaded_result, python_result, rtol=1e-02)
 
-    # Test Adding pvalues
-    clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
-    clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
-    clarite.analyze.add_corrected_pvalues(
-        python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
-    )
-    # Ensure grouped pvalue corrections match
-    grouped_bonf = (
-        python_result.reset_index(drop=False)
-        .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
-        .first()
-    )
-    grouped_fdr = (
-        python_result.reset_index(drop=False)
-        .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
-        .first()
-    )
-
     # TODO: Alter this test because nobeta did not open all categories
-    # assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
-    # assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
-    assert grouped_bonf == grouped_bonf
-    assert grouped_fdr == grouped_fdr
-
 
-def test_interaction_exe():
-    nested_table = clarite.load.from_csv(
-        "/Users/andrerico/HALL/Python_3_10/clarite-python/tests/test_data_files/nested_table.csv"
-    )
-    # Return same result if not change data type
-    # list_bin = (
-    #     "female",
-    #     "black",
-    #     "mexican",
-    #     "other_hispanic",
-    #     "other_eth",
+    # # Test Adding pvalues
+    # clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
+    # clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
+    # clarite.analyze.add_corrected_pvalues(
+    #     python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
     # )
-    # list_cat = (
-    #     "SDDSRVYR",
-    #     "SES_LEVEL",
+
+    # # Ensure grouped pvalue corrections match
+    # grouped_bonf = (
+    #     python_result.reset_index(drop=False)
+    #     .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
+    #     .first()
     # )
-    # list_cont = (
-    #     "BMXBMI",
-    #     "RIDAGEYR",
-    #     "LBXCOT",
-    #     "IRON_mg",
-    #     "DR1TSFAT",
-    #     "DRDSDT1",
+    # grouped_fdr = (
+    #     python_result.reset_index(drop=False)
+    #     .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
+    #     .first()
     # )
 
-    # nested_table = clarite.modify.make_binary(data=nested_table, only=(list_bin))
-    # nested_table = clarite.modify.make_categorical(data=nested_table, only=(list_cat))
-    # nested_table = clarite.modify.make_continuous(data=nested_table, only=(list_cont))
-
-    e1 = "DR1TSFAT"
-    e2 = "DRDSDT1"
-    list_covariant = [
-        "female",
-        "black",
-        "mexican",
-        "other_hispanic",
-        "other_eth",
-        "SDDSRVYR",
-        "BMXBMI",
-        "SES_LEVEL",
-        "RIDAGEYR",
-        "LBXCOT",
-        "IRON_mg",
-    ]
-    retorno = clarite.analyze.interaction_study(
-        data=nested_table,
-        outcomes="LBXHGB",
-        interactions=[(e1, e2)],
-        covariates=list_covariant,
-    )
-
-    assert retorno == retorno
+    # assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
+    # assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
diff --git a/tests/on_demand/test_debug_pvalue.py b/tests/on_demand/test_debug_pvalue.py
@@ -45,6 +45,7 @@ def test_interactions_debug():
         interactions=[(e1, e2)],
         covariates=list_covariant,
         report_betas=True,
+        min_n=8000,
     )
 
     print(df_inter)

diff --git a/tests/py_test_output/top_results_nhanesreal.png b/tests/py_test_output/top_results_nhanesreal.png
diff --git a/tests/py_test_output/top_results_nhanesreal_no_cutoff.png b/tests/py_test_output/top_results_nhanesreal_no_cutoff.png
diff --git a/tests/py_test_output/top_results_nhanessmall.png b/tests/py_test_output/top_results_nhanessmall.png