From b087f7df22c381962442c3cd302b1da4813fb577 Mon Sep 17 00:00:00 2001 From: Alejandro Velez Date: Mon, 13 May 2024 11:39:36 -0400 Subject: [PATCH 1/3] change pinnacle dataset names to the appropriate opentargets --- tdc/benchmark_group/scdti_group.py | 8 +-- tdc/dataset_configs/config_map.py | 4 +- .../{pinnacle_dti.py => opentargets_dti.py} | 30 +++++------ tdc/metadata.py | 54 +++++++++---------- tdc/test/test_benchmark.py | 4 +- tdc/test/test_dataloaders.py | 2 +- 6 files changed, 51 insertions(+), 51 deletions(-) rename tdc/dataset_configs/{pinnacle_dti.py => opentargets_dti.py} (51%) diff --git a/tdc/benchmark_group/scdti_group.py b/tdc/benchmark_group/scdti_group.py index eda47efb..8abbc73b 100644 --- a/tdc/benchmark_group/scdti_group.py +++ b/tdc/benchmark_group/scdti_group.py @@ -18,8 +18,8 @@ def __init__(self, path="./data", file_format="csv"): # super().__init__(name="SCDTI_Group", path=path) self.name = "SCDTI_Group" self.path = os.path.join(path, self.name) - # self.datasets = ["pinnacle_dti"] - self.dataset_names = ["pinnacle_dti"] + # self.datasets = ["opentargets_dti"] + self.dataset_names = ["opentargets_dti"] self.file_format = file_format self.split = None @@ -27,14 +27,14 @@ def get_train_valid_split(self): """parameters included for compatibility. this benchmark has a fixed train/test split.""" from ..resource.dataloader import DataLoader if self.split is None: - dl = DataLoader(name="pinnacle_dti") + dl = DataLoader(name="opentargets_dti") self.split = dl.get_split() return self.split["train"], self.split["dev"] def get_test(self): from ..resource.dataloader import DataLoader if self.split is None: - dl = DataLoader(name="pinnacle_dti") + dl = DataLoader(name="opentargets_dti") self.split = dl.get_split() return self.split["test"] diff --git a/tdc/dataset_configs/config_map.py b/tdc/dataset_configs/config_map.py index 9bc7e4ea..2b5872c4 100644 --- a/tdc/dataset_configs/config_map.py +++ b/tdc/dataset_configs/config_map.py @@ -1,5 +1,5 @@ from .pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig -from .pinnacle_dti import PinnacleDTI +from .opentargets_dti import OpentargetsDTI from .scperturb_config import SCPerturb, SCPerturb_Gene scperturb_datasets = [ @@ -28,4 +28,4 @@ def __init__(self): self[ds] = SCPerturb for ds in scperturb_gene_datasets: self[ds] = SCPerturb_Gene - self["pinnacle_dti"] = PinnacleDTI + self["opentargets_dti"] = OpentargetsDTI diff --git a/tdc/dataset_configs/pinnacle_dti.py b/tdc/dataset_configs/opentargets_dti.py similarity index 51% rename from tdc/dataset_configs/pinnacle_dti.py rename to tdc/dataset_configs/opentargets_dti.py index e5604d78..0ebe7240 100644 --- a/tdc/dataset_configs/pinnacle_dti.py +++ b/tdc/dataset_configs/opentargets_dti.py @@ -2,57 +2,57 @@ from ..feature_generators.resource import ResourceFeatureGenerator -class PinnacleDTI(ResourceConfig): - """Configuration for PINNACLE drug-target-identification datasets""" +class OpentargetsDTI(ResourceConfig): + """Configuration for opentargets drug-target-identification datasets""" def __init__(self): - super(PinnacleDTI, self).__init__( + super(OpentargetsDTI, self).__init__( ResourceFeatureGenerator(), - keys=["pinnacle_ra_data_splits", "pinnacle_ibd_data_splits", "df"], + keys=["opentargets_ra_data_splits", "opentargets_ibd_data_splits", "df"], loader_functions=["split", "split", "concat"], loader_args=[{ - "dataset": ("self", "pinnacle_ra_drug_evidence"), + "dataset": ("self", "opentargets_ra_drug_evidence"), "column_name": "targetId_genename", "pos_train": ("self", - ["pinnacle_ra_data_splits", "splits", "pos_train_names"]), + ["opentargets_ra_data_splits", "splits", "pos_train_names"]), "pos_dev": None, "pos_test": ("self", - ["pinnacle_ra_data_splits", "splits", "pos_test_names"]), + ["opentargets_ra_data_splits", "splits", "pos_test_names"]), "neg_train": ("self", - ["pinnacle_ra_data_splits", "splits", "neg_train_names"]), + ["opentargets_ra_data_splits", "splits", "neg_train_names"]), "neg_dev": None, "neg_test": ("self", - ["pinnacle_ra_data_splits", "splits", "neg_test_names"]), + ["opentargets_ra_data_splits", "splits", "neg_test_names"]), }, { - "dataset": ("self", "pinnacle_ibd_drug_evidence"), + "dataset": ("self", "opentargets_ibd_drug_evidence"), "column_name": "targetId_genename", "pos_train": ("self", - ["pinnacle_ibd_data_splits", "splits", "pos_train_names"]), + ["opentargets_ibd_data_splits", "splits", "pos_train_names"]), "pos_dev": None, "pos_test": ("self", - ["pinnacle_ibd_data_splits", "splits", "pos_test_names"]), + ["opentargets_ibd_data_splits", "splits", "pos_test_names"]), "neg_train": ("self", - ["pinnacle_ibd_data_splits", "splits", "neg_train_names"]), + ["opentargets_ibd_data_splits", "splits", "neg_train_names"]), "neg_dev": None, "neg_test": ("self", - ["pinnacle_ibd_data_splits", "splits", "neg_test_names"]) + ["opentargets_ibd_data_splits", "splits", "neg_test_names"]) }, { "ds_list": [ - "pinnacle_ibd_data_splits", "pinnacle_ra_data_splits" + "opentargets_ibd_data_splits", "opentargets_ra_data_splits" ], "axis": 0 }]) diff --git a/tdc/metadata.py b/tdc/metadata.py index e47ca9b9..0b51b8b5 100644 --- a/tdc/metadata.py +++ b/tdc/metadata.py @@ -193,29 +193,29 @@ ] resource_dataset_names = [ - "pinnacle_ra_data_splits", - "pinnacle_ibd_data_splits", - "pinnacle_ra_drug_evidence", - "pinnacle_ibd_drug_evidence", - "pinnacle_ra_data_splits_idx", - "pinnacle_ibd_data_splits_idx", + "opentargets_ra_data_splits", + "opentargets_ibd_data_splits", + "opentargets_ra_drug_evidence", + "opentargets_ibd_drug_evidence", + "opentargets_ra_data_splits_idx", + "opentargets_ibd_data_splits_idx", ] resources = { - "pinnacle_dti": { + "opentargets_dti": { "splits": [ - "pinnacle_ra_data_splits", - "pinnacle_ibd_data_splits", + "opentargets_ra_data_splits", + "opentargets_ibd_data_splits", ], "datasets": [ - "pinnacle_ra_drug_evidence", - "pinnacle_ibd_drug_evidence", + "opentargets_ra_drug_evidence", + "opentargets_ibd_drug_evidence", ], "all": [ - "pinnacle_ra_data_splits", - "pinnacle_ibd_data_splits", - "pinnacle_ra_drug_evidence", - "pinnacle_ibd_drug_evidence", + "opentargets_ra_data_splits", + "opentargets_ibd_data_splits", + "opentargets_ra_drug_evidence", + "opentargets_ibd_drug_evidence", ], } } @@ -779,12 +779,12 @@ def get_task2category(): "scperturb_gene_NormanWeissman2019": "h5ad", "scperturb_gene_ReplogleWeissman2022_rpe1": "h5ad", "scperturb_gene_ReplogleWeissman2022_k562_essential": "h5ad", - "pinnacle_ra_data_splits": "json", - "pinnacle_ra_data_splits_idx": "json", - "pinnacle_ibd_data_splits": "json", - "pinnacle_ibd_data_splits_idx": "json", - "pinnacle_ra_drug_evidence": "tab", - "pinnacle_ibd_drug_evidence": "tab", + "opentargets_ra_data_splits": "json", + "opentargets_ra_data_splits_idx": "json", + "opentargets_ibd_data_splits": "json", + "opentargets_ibd_data_splits_idx": "json", + "opentargets_ra_drug_evidence": "tab", + "opentargets_ibd_drug_evidence": "tab", } name2id = { @@ -908,12 +908,12 @@ def get_task2category(): "scperturb_gene_NormanWeissman2019": 10133995, "scperturb_gene_ReplogleWeissman2022_rpe1": 10133996, "scperturb_gene_ReplogleWeissman2022_k562_essential": 10134031, - "pinnacle_ra_data_splits": 10141152, - "pinnacle_ibd_data_splits": 10141151, - "pinnacle_ra_data_splits_idx": 10143574, - "pinnacle_ibd_data_splits_idx": 10143573, - "pinnacle_ra_drug_evidence": 10141153, - "pinnacle_ibd_drug_evidence": 10141154, + "opentargets_ra_data_splits": 10141152, + "opentargets_ibd_data_splits": 10141151, + "opentargets_ra_data_splits_idx": 10143574, + "opentargets_ibd_data_splits_idx": 10143573, + "opentargets_ra_drug_evidence": 10141153, + "opentargets_ibd_drug_evidence": 10141154, } oracle2type = { diff --git a/tdc/test/test_benchmark.py b/tdc/test/test_benchmark.py index 6a89c299..2b448493 100644 --- a/tdc/test/test_benchmark.py +++ b/tdc/test/test_benchmark.py @@ -65,7 +65,7 @@ def test_ADME_evaluate_many(self): def test_SCDTI_benchmark(self): from tdc.resource.dataloader import DataLoader - data = DataLoader(name="pinnacle_dti") + data = DataLoader(name="opentargets_dti") group = scdti_group.SCDTIGroup() train, val = group.get_train_valid_split() assert len(val) == 0 # this benchmark has no validation set @@ -73,7 +73,7 @@ def test_SCDTI_benchmark(self): y_true = group.get_test()["Y"] results = group.evaluate(y_true) assert results[-1] == 1.0 # should be perfect F1 score - # assert it matches the PINNACLE official test scores + # assert it matches the opentargets official test scores tst = data.get_split()["test"]["Y"] results = group.evaluate(tst) assert results[-1] == 1.0 diff --git a/tdc/test/test_dataloaders.py b/tdc/test/test_dataloaders.py index 220d04fa..8bb27fd5 100644 --- a/tdc/test/test_dataloaders.py +++ b/tdc/test/test_dataloaders.py @@ -96,7 +96,7 @@ def test_resource_dataverse_dataloader(self): import pandas as pd from tdc.resource.dataloader import DataLoader - data = DataLoader(name="pinnacle_dti") + data = DataLoader(name="opentargets_dti") df = data.get_data() assert "Y" in df.columns split = data.get_split() From 37c14cfb19e0d2334e9927bf69904a61675d5a87 Mon Sep 17 00:00:00 2001 From: Alejandro Velez Date: Mon, 13 May 2024 11:48:16 -0400 Subject: [PATCH 2/3] fix naming for pentelute dataset -> should be author Joseph Brown --- tdc/dataset_configs/__init__.py | 2 +- ...e2_12ca5_config.py => brown_mdm2_ace2_12ca5_config.py} | 8 ++++---- tdc/dataset_configs/config_map.py | 4 ++-- tdc/metadata.py | 6 +++--- tdc/multi_pred/proteinpeptide.py | 1 - tdc/test/test_dataloaders.py | 6 +++--- 6 files changed, 13 insertions(+), 14 deletions(-) rename tdc/dataset_configs/{pentelute_mdm2_ace2_12ca5_config.py => brown_mdm2_ace2_12ca5_config.py} (78%) diff --git a/tdc/dataset_configs/__init__.py b/tdc/dataset_configs/__init__.py index 120c83d8..937a8d43 100644 --- a/tdc/dataset_configs/__init__.py +++ b/tdc/dataset_configs/__init__.py @@ -1,2 +1,2 @@ from .config import DatasetConfig -from .pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig \ No newline at end of file +from .brown_mdm2_ace2_12ca5_config import BrownProteinPeptideConfig \ No newline at end of file diff --git a/tdc/dataset_configs/pentelute_mdm2_ace2_12ca5_config.py b/tdc/dataset_configs/brown_mdm2_ace2_12ca5_config.py similarity index 78% rename from tdc/dataset_configs/pentelute_mdm2_ace2_12ca5_config.py rename to tdc/dataset_configs/brown_mdm2_ace2_12ca5_config.py index 592cce15..b0960a9f 100644 --- a/tdc/dataset_configs/pentelute_mdm2_ace2_12ca5_config.py +++ b/tdc/dataset_configs/brown_mdm2_ace2_12ca5_config.py @@ -2,12 +2,12 @@ from ..feature_generators.protein_feature_generator import ProteinFeatureGenerator -class PenteluteProteinPeptideConfig(DatasetConfig): - """Configuration for the pentelute-protein-peptide datasets""" +class BrownProteinPeptideConfig(DatasetConfig): + """Configuration for the brown-protein-peptide datasets""" def __init__(self): - super(PenteluteProteinPeptideConfig, self).__init__( - dataset_name="pentelute_mdm2_ace2_12ca5", + super(BrownProteinPeptideConfig, self).__init__( + dataset_name="brown_mdm2_ace2_12ca5", data_processing_class=ProteinFeatureGenerator, functions_to_run=[ "autofill_identifier", "create_range", "insert_protein_sequence" diff --git a/tdc/dataset_configs/config_map.py b/tdc/dataset_configs/config_map.py index 2b5872c4..22462c04 100644 --- a/tdc/dataset_configs/config_map.py +++ b/tdc/dataset_configs/config_map.py @@ -1,4 +1,4 @@ -from .pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig +from .brown_mdm2_ace2_12ca5_config import BrownProteinPeptideConfig from .opentargets_dti import OpentargetsDTI from .scperturb_config import SCPerturb, SCPerturb_Gene @@ -23,7 +23,7 @@ class ConfigMap(dict): """ def __init__(self): - self["pentelute_mdm2_ace2_12ca5"] = PenteluteProteinPeptideConfig + self["brown_mdm2_ace2_12ca5"] = BrownProteinPeptideConfig for ds in scperturb_datasets: self[ds] = SCPerturb for ds in scperturb_gene_datasets: diff --git a/tdc/metadata.py b/tdc/metadata.py index 0b51b8b5..da34214e 100644 --- a/tdc/metadata.py +++ b/tdc/metadata.py @@ -179,7 +179,7 @@ trial_outcome_dataset_names = ['phase1', 'phase2', 'phase3'] -proteinpeptide_dataset_names = ['pentelute_mdm2_ace2_12ca5'] +proteinpeptide_dataset_names = ['brown_mdm2_ace2_12ca5'] cellxgene_dataset_names = [ "scperturb_drug_AissaBenevolenskaya2021", @@ -770,7 +770,7 @@ def get_task2category(): "phase1": "tab", "phase2": "tab", "phase3": "tab", - "pentelute_mdm2_ace2_12ca5": "xlsx", + "brown_mdm2_ace2_12ca5": "xlsx", "scperturb_drug_AissaBenevolenskaya2021": "h5ad", "scperturb_drug_SrivatsanTrapnell2020_sciplex2": "h5ad", "scperturb_drug_SrivatsanTrapnell2020_sciplex3": "h5ad", @@ -899,7 +899,7 @@ def get_task2category(): "phase1": 7331305, "phase2": 7331306, "phase3": 7331307, - "pentelute_mdm2_ace2_12ca5": 9649623, + "brown_mdm2_ace2_12ca5": 9649623, "scperturb_drug_AissaBenevolenskaya2021": 9845396, "scperturb_drug_SrivatsanTrapnell2020_sciplex2": 9845394, "scperturb_drug_SrivatsanTrapnell2020_sciplex3": 9845397, diff --git a/tdc/multi_pred/proteinpeptide.py b/tdc/multi_pred/proteinpeptide.py index a0580f12..87988034 100644 --- a/tdc/multi_pred/proteinpeptide.py +++ b/tdc/multi_pred/proteinpeptide.py @@ -8,7 +8,6 @@ import sys from ..utils import print_sys -from ..dataset_configs.pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig from . import bi_pred_dataset from ..metadata import dataset_names from ..dataset_configs.config_map import ConfigMap diff --git a/tdc/test/test_dataloaders.py b/tdc/test/test_dataloaders.py index 8bb27fd5..e8c3906e 100644 --- a/tdc/test/test_dataloaders.py +++ b/tdc/test/test_dataloaders.py @@ -65,13 +65,13 @@ def test_cellxgene_list(self): assert len(df) > 0 print(df.head()) - def test_pentelute(self): + def test_brown(self): # TODO: factor out into specialized test suites for individual datasets # this test serves as an integration test of the data processing, data configs, and existing tdc pipeline. leave here for now. from tdc.multi_pred import ProteinPeptide - data = ProteinPeptide(name="pentelute_mdm2_ace2_12ca5") + data = ProteinPeptide(name="brown_mdm2_ace2_12ca5") assert "protein_or_rna_sequence" in data.get_data( - ).columns # pentelute protein<>peptide dataset uses a data config inserting this column + ).columns # brown protein<>peptide dataset uses a data config inserting this column data.get_split() @unittest.skip( From 91ee851f92ba0031ee911871c4c67bbfd152330e Mon Sep 17 00:00:00 2001 From: Alejandro Velez Date: Mon, 13 May 2024 11:51:58 -0400 Subject: [PATCH 3/3] mend --- tdc/dataset_configs/opentargets_dti.py | 47 ++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/tdc/dataset_configs/opentargets_dti.py b/tdc/dataset_configs/opentargets_dti.py index 0ebe7240..b748d194 100644 --- a/tdc/dataset_configs/opentargets_dti.py +++ b/tdc/dataset_configs/opentargets_dti.py @@ -8,48 +8,53 @@ class OpentargetsDTI(ResourceConfig): def __init__(self): super(OpentargetsDTI, self).__init__( ResourceFeatureGenerator(), - keys=["opentargets_ra_data_splits", "opentargets_ibd_data_splits", "df"], + keys=[ + "opentargets_ra_data_splits", "opentargets_ibd_data_splits", + "df" + ], loader_functions=["split", "split", "concat"], loader_args=[{ "dataset": ("self", "opentargets_ra_drug_evidence"), "column_name": "targetId_genename", - "pos_train": - ("self", - ["opentargets_ra_data_splits", "splits", "pos_train_names"]), + "pos_train": ("self", [ + "opentargets_ra_data_splits", "splits", "pos_train_names" + ]), "pos_dev": None, "pos_test": ("self", - ["opentargets_ra_data_splits", "splits", "pos_test_names"]), - "neg_train": - ("self", - ["opentargets_ra_data_splits", "splits", "neg_train_names"]), + ["opentargets_ra_data_splits", "splits", + "pos_test_names"]), + "neg_train": ("self", [ + "opentargets_ra_data_splits", "splits", "neg_train_names" + ]), "neg_dev": None, "neg_test": ("self", - ["opentargets_ra_data_splits", "splits", "neg_test_names"]), + ["opentargets_ra_data_splits", "splits", + "neg_test_names"]), }, { "dataset": ("self", "opentargets_ibd_drug_evidence"), "column_name": "targetId_genename", - "pos_train": - ("self", - ["opentargets_ibd_data_splits", "splits", "pos_train_names"]), + "pos_train": ("self", [ + "opentargets_ibd_data_splits", "splits", "pos_train_names" + ]), "pos_dev": None, - "pos_test": - ("self", - ["opentargets_ibd_data_splits", "splits", "pos_test_names"]), - "neg_train": - ("self", - ["opentargets_ibd_data_splits", "splits", "neg_train_names"]), + "pos_test": ("self", [ + "opentargets_ibd_data_splits", "splits", "pos_test_names" + ]), + "neg_train": ("self", [ + "opentargets_ibd_data_splits", "splits", "neg_train_names" + ]), "neg_dev": None, - "neg_test": - ("self", - ["opentargets_ibd_data_splits", "splits", "neg_test_names"]) + "neg_test": ("self", [ + "opentargets_ibd_data_splits", "splits", "neg_test_names" + ]) }, { "ds_list": [ "opentargets_ibd_data_splits", "opentargets_ra_data_splits"