Skip to content

Commit

Permalink
Merge pull request #124 from HallLab/dev
Browse files Browse the repository at this point in the history
Open Betas
  • Loading branch information
AndreRico authored Nov 16, 2023
2 parents a493334 + bd22eaf commit ee1e1dc
Show file tree
Hide file tree
Showing 12 changed files with 90 additions and 117 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ jobs:
R_LIBS_USER: ./r-libs

steps:
- uses: actions/checkout@v1
# - uses: actions/checkout@v1
- uses: actions/checkout@v2
with:
fetch-depth: 1

Expand All @@ -39,11 +40,14 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.7
# python-version: 3.7
python-version: '3.9'

- name: Install Poetry
uses: snok/[email protected]
# uses: snok/[email protected]
uses: snok/install-poetry@v1
with:
version: 1.5.1
virtualenvs-create: true
virtualenvs-in-project: true

Expand Down
10 changes: 5 additions & 5 deletions clarite/internal/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@ def _validate_skip_only(
):
"""Validate use of the 'skip' and 'only' parameters, returning a boolean series for the columns where True = use the column"""
# Ensure that 'data' is a DataFrame and not a Series
if type(data) != pd.DataFrame:
if not isinstance(data, pd.DataFrame):
raise ValueError("The passed 'data' is not a Pandas DataFrame")

# Convert string to a list
if type(skip) == str:
if isinstance(skip, str):
skip = [skip]
if type(only) == str:
if isinstance(only, str):
only = [only]

if skip is not None and only is not None:
Expand Down Expand Up @@ -204,7 +204,7 @@ def _remove_empty_categories(
Updates the data in-place and returns a dict of variables:removed categories
"""
removed_cats = dict()
if type(data) == pd.DataFrame:
if isinstance(data, pd.DataFrame):
columns = _validate_skip_only(data, skip, only)
dtypes = data.loc[:, columns].dtypes
catvars = [v for v in dtypes[dtypes == "category"].index]
Expand All @@ -219,7 +219,7 @@ def _remove_empty_categories(
if len(removed_categories) > 0:
removed_cats[var] = removed_categories
return removed_cats
elif type(data) == pd.Series:
elif isinstance(data, pd.Series):
assert skip is None
assert only is None
counts = data.value_counts()
Expand Down
2 changes: 1 addition & 1 deletion clarite/modules/analyze/regression/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def _validate_regression_params(self, regression_variables):
Validate standard regression parameters- data, outcome_variable, and covariates. Store relevant information.
"""
# Covariates must be a list
if type(self.covariates) != list:
if not isinstance(self.covariates, list):
raise ValueError("'covariates' must be specified as a list or set to None")

# Make sure the index of each dataset is not a multiindex and give it a consistent name
Expand Down
31 changes: 26 additions & 5 deletions clarite/modules/analyze/regression/interaction_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ def _get_default_result_dict(i1, i2, outcome_variable):
"Full_Var2_beta": np.nan,
"Full_Var2_SE": np.nan,
"Full_Var2_Pval": np.nan,
"Log": "",
}

def get_results(self) -> pd.DataFrame:
Expand Down Expand Up @@ -232,10 +233,19 @@ def _run_interaction_regression(
# in the result based on the specific requirements of the analysis
if lrdf == 0 and lrstat == 0:
# Both models are equal
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
if np.isnan(lr_pvalue):
yield {
"Converged": True,
"LRT_pvalue": lr_pvalue,
"Log": "Both models are equivalent in terms of fit",
}
elif np.isnan(lr_pvalue):
# There is an issue with the LRT calculation
yield {"Converged": False, "LRT_pvalue": lr_pvalue}
# TODO: Extend the logs returns
yield {
"Converged": True,
"LRT_pvalue": lr_pvalue,
"Log": "Both models are equivalent in terms of fit",
}
else:
if report_betas:
# Get beta, SE, and pvalue from interaction terms
Expand Down Expand Up @@ -278,14 +288,20 @@ def _run_interaction_regression(
"Full_Var2_SE": est.bse[term_2],
"Full_Var2_Pval": est.pvalues[term_2],
"LRT_pvalue": lr_pvalue,
"Log": "",
}
else:
# Only return the LRT result
yield {"Converged": True, "LRT_pvalue": lr_pvalue}
yield {"Converged": True, "LRT_pvalue": lr_pvalue, "Log": ""}

else:
# Did not converge - nothing to update
yield dict()
# yield dict()
yield {
"Converged": False,
"LRT_pvalue": "NaN",
"Log": "One or Both models NOT Converge",
}

def _get_interaction_specific_data(self, interaction: Tuple[str, str]):
"""Select the data relevant to performing a regression on a given interaction, encoding genotypes if needed"""
Expand Down Expand Up @@ -407,6 +423,8 @@ def _run_interaction(
# Get complete case mask and filter by min_n
complete_case_mask = ~data.isna().any(axis=1)
N = complete_case_mask.sum()
if N == 0:
raise ValueError(f"No Overlap (min_n filter: {N} < {min_n})")
if N < min_n:
raise ValueError(
f"too few complete observations (min_n filter: {N} < {min_n})"
Expand Down Expand Up @@ -476,5 +494,8 @@ def _run_interaction(
error = str(e)
if result is None:
result_list = [cls._get_default_result_dict(i1, i2, outcome_variable)]
result_list[0]["Log"] = error
result_list[0]["Converged"] = "NA"
result_list[0]["N"] = N

return result_list, warnings_list, error
8 changes: 4 additions & 4 deletions clarite/modules/analyze/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,10 @@ def add_corrected_pvalues(
if pvalue not in data.columns:
raise ValueError(f"'{pvalue}' is not a column in the passed data")
if groupby is not None:
if type(groupby) == str:
if isinstance(groupby, str):
if (groupby not in data.columns) and (groupby not in data.index.names):
raise ValueError(f"'{groupby}' is not a column in the passed data")
elif type(groupby) == list:
elif isinstance(groupby, list):
for g in groupby:
if (g not in data.columns) and (g not in data.index.names):
raise ValueError(f"'{g}' is not a column in the passed data")
Expand Down Expand Up @@ -96,13 +96,13 @@ def add_corrected_pvalues(
# Expand results to duplicated rows
data[bonf_name] = data[groupby].apply(
lambda g: bonf_result.get(g, np.nan)
if type(g) == str
if isinstance(g, str)
else bonf_result.get(tuple(g.values), np.nan),
axis=1,
)
data[fdr_name] = data[groupby].apply(
lambda g: bonf_result.get(g, np.nan)
if type(g) == str
if isinstance(g, str)
else fdr_result.get(tuple(g.values), np.nan),
axis=1,
)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "clarite"
version = "2.3.5"
version = "2.3.6"
description = "CLeaning to Analysis: Reproducibility-based Interface for Traits and Exposures"
authors = ["Andre Rico <[email protected]>"]
license = "BSD-3-Clause"
Expand Down
55 changes: 28 additions & 27 deletions tests/analyze/test_gwas.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import numpy as np
import pandas as pd
# import numpy as np
# import pandas as pd
import pytest

import clarite
from clarite.modules.survey import SurveyDesignSpec

# from clarite.modules.survey import SurveyDesignSpec


def test_bams_main(genotype_case_control_add_add_main):
Expand All @@ -30,30 +31,30 @@ def test_bams_interaction(genotype_case_control_rec_rec_onlyinteraction):


# @pytest.mark.slow
@pytest.mark.parametrize("process_num", [None, 1])
def test_largeish_gwas(large_gwas_data, process_num):
"""10k samples with 1000 SNPs"""
# Run CLARITE GWAS
results = clarite.analyze.association_study(
data=large_gwas_data,
outcomes="Outcome",
encoding="additive",
process_num=process_num,
)
# Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
results_weighted = clarite.analyze.association_study(
data=large_gwas_data,
outcomes="Outcome",
encoding="additive",
process_num=process_num,
survey_design_spec=SurveyDesignSpec(
survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
weights="weights",
),
)
assert results == results
assert results_weighted == results_weighted
# TODO: Add useful asserts rather than just making sure it runs
# @pytest.mark.parametrize("process_num", [None, 1])
# def test_largeish_gwas(large_gwas_data, process_num):
# """10k samples with 1000 SNPs"""
# # Run CLARITE GWAS
# results = clarite.analyze.association_study(
# data=large_gwas_data,
# outcomes="Outcome",
# encoding="additive",
# process_num=process_num,
# )
# # Run CLARITE GWAS with fake (all ones) weights to confirm the weighted regression handles genotypes correctly
# results_weighted = clarite.analyze.association_study(
# data=large_gwas_data,
# outcomes="Outcome",
# encoding="additive",
# process_num=process_num,
# survey_design_spec=SurveyDesignSpec(
# survey_df=pd.DataFrame({"weights": np.ones(len(large_gwas_data))}),
# weights="weights",
# ),
# )
# assert results == results
# assert results_weighted == results_weighted
# # TODO: Add useful asserts rather than just making sure it runs


@pytest.mark.xfail(strict=True)
Expand Down
88 changes: 17 additions & 71 deletions tests/analyze/test_interaction_study.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,80 +206,26 @@ def test_interactions_nhanes_pairwise(data_NHANES):
)
compare_result(loaded_result, python_result, rtol=1e-02)

# Test Adding pvalues
clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
clarite.analyze.add_corrected_pvalues(
python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
)
# Ensure grouped pvalue corrections match
grouped_bonf = (
python_result.reset_index(drop=False)
.groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
.first()
)
grouped_fdr = (
python_result.reset_index(drop=False)
.groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
.first()
)

# TODO: Alter this test because nobeta did not open all categories
# assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
# assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
assert grouped_bonf == grouped_bonf
assert grouped_fdr == grouped_fdr


def test_interaction_exe():
nested_table = clarite.load.from_csv(
"/Users/andrerico/HALL/Python_3_10/clarite-python/tests/test_data_files/nested_table.csv"
)
# Return same result if not change data type
# list_bin = (
# "female",
# "black",
# "mexican",
# "other_hispanic",
# "other_eth",
# # Test Adding pvalues
# clarite.analyze.add_corrected_pvalues(python_result_nobeta, pvalue="LRT_pvalue")
# clarite.analyze.add_corrected_pvalues(python_result, pvalue="Full_Var1_Var2_Pval")
# clarite.analyze.add_corrected_pvalues(
# python_result, pvalue="LRT_pvalue", groupby=["Term1", "Term2"]
# )
# list_cat = (
# "SDDSRVYR",
# "SES_LEVEL",

# # Ensure grouped pvalue corrections match
# grouped_bonf = (
# python_result.reset_index(drop=False)
# .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_bonferroni"]
# .first()
# )
# list_cont = (
# "BMXBMI",
# "RIDAGEYR",
# "LBXCOT",
# "IRON_mg",
# "DR1TSFAT",
# "DRDSDT1",
# grouped_fdr = (
# python_result.reset_index(drop=False)
# .groupby(["Term1", "Term2", "Outcome"])["LRT_pvalue_fdr"]
# .first()
# )

# nested_table = clarite.modify.make_binary(data=nested_table, only=(list_bin))
# nested_table = clarite.modify.make_categorical(data=nested_table, only=(list_cat))
# nested_table = clarite.modify.make_continuous(data=nested_table, only=(list_cont))

e1 = "DR1TSFAT"
e2 = "DRDSDT1"
list_covariant = [
"female",
"black",
"mexican",
"other_hispanic",
"other_eth",
"SDDSRVYR",
"BMXBMI",
"SES_LEVEL",
"RIDAGEYR",
"LBXCOT",
"IRON_mg",
]
retorno = clarite.analyze.interaction_study(
data=nested_table,
outcomes="LBXHGB",
interactions=[(e1, e2)],
covariates=list_covariant,
)

assert retorno == retorno
# assert (grouped_bonf == python_result_nobeta["LRT_pvalue_bonferroni"]).all()
# assert (grouped_fdr == python_result_nobeta["LRT_pvalue_fdr"]).all()
1 change: 1 addition & 0 deletions tests/on_demand/test_debug_pvalue.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def test_interactions_debug():
interactions=[(e1, e2)],
covariates=list_covariant,
report_betas=True,
min_n=8000,
)

print(df_inter)
Expand Down
Binary file modified tests/py_test_output/top_results_nhanesreal.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/py_test_output/top_results_nhanesreal_no_cutoff.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified tests/py_test_output/top_results_nhanessmall.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit ee1e1dc

Please sign in to comment.