Skip to content

Commit

Permalink
Merge pull request #260 from mims-harvard/package_release
Browse files (browse the repository at this point in the history)
Change pinnacle and pentelute dataset names to more appropriate ones
  • Loading branch information
amva13 authored May 13, 2024
2 parents 4c79d17 + 91ee851 commit e115138
Show file tree
Hide file tree
Showing 10 changed files with 112 additions and 108 deletions.
8 changes: 4 additions & 4 deletions tdc/benchmark_group/scdti_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,23 @@ def __init__(self, path="./data", file_format="csv"):
# super().__init__(name="SCDTI_Group", path=path)
self.name = "SCDTI_Group"
self.path = os.path.join(path, self.name)
# self.datasets = ["pinnacle_dti"]
self.dataset_names = ["pinnacle_dti"]
# self.datasets = ["opentargets_dti"]
self.dataset_names = ["opentargets_dti"]
self.file_format = file_format
self.split = None

def get_train_valid_split(self):
"""parameters included for compatibility. this benchmark has a fixed train/test split."""
from ..resource.dataloader import DataLoader
if self.split is None:
dl = DataLoader(name="pinnacle_dti")
dl = DataLoader(name="opentargets_dti")
self.split = dl.get_split()
return self.split["train"], self.split["dev"]

def get_test(self):
from ..resource.dataloader import DataLoader
if self.split is None:
dl = DataLoader(name="pinnacle_dti")
dl = DataLoader(name="opentargets_dti")
self.split = dl.get_split()
return self.split["test"]

Expand Down
2 changes: 1 addition & 1 deletion tdc/dataset_configs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from .config import DatasetConfig
from .pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig
from .brown_mdm2_ace2_12ca5_config import BrownProteinPeptideConfig
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from ..feature_generators.protein_feature_generator import ProteinFeatureGenerator


class PenteluteProteinPeptideConfig(DatasetConfig):
"""Configuration for the pentelute-protein-peptide datasets"""
class BrownProteinPeptideConfig(DatasetConfig):
"""Configuration for the brown-protein-peptide datasets"""

def __init__(self):
super(PenteluteProteinPeptideConfig, self).__init__(
dataset_name="pentelute_mdm2_ace2_12ca5",
super(BrownProteinPeptideConfig, self).__init__(
dataset_name="brown_mdm2_ace2_12ca5",
data_processing_class=ProteinFeatureGenerator,
functions_to_run=[
"autofill_identifier", "create_range", "insert_protein_sequence"
Expand Down
8 changes: 4 additions & 4 deletions tdc/dataset_configs/config_map.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig
from .pinnacle_dti import PinnacleDTI
from .brown_mdm2_ace2_12ca5_config import BrownProteinPeptideConfig
from .opentargets_dti import OpentargetsDTI
from .scperturb_config import SCPerturb, SCPerturb_Gene

scperturb_datasets = [
Expand All @@ -23,9 +23,9 @@ class ConfigMap(dict):
"""

def __init__(self):
self["pentelute_mdm2_ace2_12ca5"] = PenteluteProteinPeptideConfig
self["brown_mdm2_ace2_12ca5"] = BrownProteinPeptideConfig
for ds in scperturb_datasets:
self[ds] = SCPerturb
for ds in scperturb_gene_datasets:
self[ds] = SCPerturb_Gene
self["pinnacle_dti"] = PinnacleDTI
self["opentargets_dti"] = OpentargetsDTI
63 changes: 63 additions & 0 deletions tdc/dataset_configs/opentargets_dti.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from .config import ResourceConfig
from ..feature_generators.resource import ResourceFeatureGenerator


class OpentargetsDTI(ResourceConfig):
    """Resource configuration for the opentargets drug-target-identification data.

    Registers three keys with the parent ``ResourceConfig``: the RA splits,
    the IBD splits, and ``"df"``. They are paired positionally with the
    loader functions ``"split"``, ``"split"`` and ``"concat"`` and with one
    argument dict each.
    """

    def __init__(self):
        # Key names are referenced several times below; name them once.
        ra_splits = "opentargets_ra_data_splits"
        ibd_splits = "opentargets_ibd_data_splits"

        def split_args(evidence_name, splits_name):
            """Build the argument dict for one "split" loader entry."""

            def names(field):
                # NOTE(review): ("self", ...) tuples are presumably resolved
                # by the resource loader against its own loaded entries --
                # confirm in ResourceConfig / the feature generator.
                return ("self", [splits_name, "splits", field])

            return {
                "dataset": ("self", evidence_name),
                "column_name": "targetId_genename",
                "pos_train": names("pos_train_names"),
                # No dev names are supplied for either label.
                "pos_dev": None,
                "pos_test": names("pos_test_names"),
                "neg_train": names("neg_train_names"),
                "neg_dev": None,
                "neg_test": names("neg_test_names"),
            }

        # Arguments for the "concat" entry stored under the "df" key; the
        # IBD splits are listed before the RA splits.
        concat_args = {"ds_list": [ibd_splits, ra_splits], "axis": 0}

        super().__init__(
            ResourceFeatureGenerator(),
            keys=[ra_splits, ibd_splits, "df"],
            loader_functions=["split", "split", "concat"],
            loader_args=[
                split_args("opentargets_ra_drug_evidence", ra_splits),
                split_args("opentargets_ibd_drug_evidence", ibd_splits),
                concat_args,
            ],
        )
58 changes: 0 additions & 58 deletions tdc/dataset_configs/pinnacle_dti.py

This file was deleted.

60 changes: 30 additions & 30 deletions tdc/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@

trial_outcome_dataset_names = ['phase1', 'phase2', 'phase3']

proteinpeptide_dataset_names = ['pentelute_mdm2_ace2_12ca5']
proteinpeptide_dataset_names = ['brown_mdm2_ace2_12ca5']

cellxgene_dataset_names = [
"scperturb_drug_AissaBenevolenskaya2021",
Expand All @@ -193,29 +193,29 @@
]

resource_dataset_names = [
"pinnacle_ra_data_splits",
"pinnacle_ibd_data_splits",
"pinnacle_ra_drug_evidence",
"pinnacle_ibd_drug_evidence",
"pinnacle_ra_data_splits_idx",
"pinnacle_ibd_data_splits_idx",
"opentargets_ra_data_splits",
"opentargets_ibd_data_splits",
"opentargets_ra_drug_evidence",
"opentargets_ibd_drug_evidence",
"opentargets_ra_data_splits_idx",
"opentargets_ibd_data_splits_idx",
]

resources = {
"pinnacle_dti": {
"opentargets_dti": {
"splits": [
"pinnacle_ra_data_splits",
"pinnacle_ibd_data_splits",
"opentargets_ra_data_splits",
"opentargets_ibd_data_splits",
],
"datasets": [
"pinnacle_ra_drug_evidence",
"pinnacle_ibd_drug_evidence",
"opentargets_ra_drug_evidence",
"opentargets_ibd_drug_evidence",
],
"all": [
"pinnacle_ra_data_splits",
"pinnacle_ibd_data_splits",
"pinnacle_ra_drug_evidence",
"pinnacle_ibd_drug_evidence",
"opentargets_ra_data_splits",
"opentargets_ibd_data_splits",
"opentargets_ra_drug_evidence",
"opentargets_ibd_drug_evidence",
],
}
}
Expand Down Expand Up @@ -770,7 +770,7 @@ def get_task2category():
"phase1": "tab",
"phase2": "tab",
"phase3": "tab",
"pentelute_mdm2_ace2_12ca5": "xlsx",
"brown_mdm2_ace2_12ca5": "xlsx",
"scperturb_drug_AissaBenevolenskaya2021": "h5ad",
"scperturb_drug_SrivatsanTrapnell2020_sciplex2": "h5ad",
"scperturb_drug_SrivatsanTrapnell2020_sciplex3": "h5ad",
Expand All @@ -779,12 +779,12 @@ def get_task2category():
"scperturb_gene_NormanWeissman2019": "h5ad",
"scperturb_gene_ReplogleWeissman2022_rpe1": "h5ad",
"scperturb_gene_ReplogleWeissman2022_k562_essential": "h5ad",
"pinnacle_ra_data_splits": "json",
"pinnacle_ra_data_splits_idx": "json",
"pinnacle_ibd_data_splits": "json",
"pinnacle_ibd_data_splits_idx": "json",
"pinnacle_ra_drug_evidence": "tab",
"pinnacle_ibd_drug_evidence": "tab",
"opentargets_ra_data_splits": "json",
"opentargets_ra_data_splits_idx": "json",
"opentargets_ibd_data_splits": "json",
"opentargets_ibd_data_splits_idx": "json",
"opentargets_ra_drug_evidence": "tab",
"opentargets_ibd_drug_evidence": "tab",
}

name2id = {
Expand Down Expand Up @@ -899,7 +899,7 @@ def get_task2category():
"phase1": 7331305,
"phase2": 7331306,
"phase3": 7331307,
"pentelute_mdm2_ace2_12ca5": 9649623,
"brown_mdm2_ace2_12ca5": 9649623,
"scperturb_drug_AissaBenevolenskaya2021": 9845396,
"scperturb_drug_SrivatsanTrapnell2020_sciplex2": 9845394,
"scperturb_drug_SrivatsanTrapnell2020_sciplex3": 9845397,
Expand All @@ -908,12 +908,12 @@ def get_task2category():
"scperturb_gene_NormanWeissman2019": 10133995,
"scperturb_gene_ReplogleWeissman2022_rpe1": 10133996,
"scperturb_gene_ReplogleWeissman2022_k562_essential": 10134031,
"pinnacle_ra_data_splits": 10141152,
"pinnacle_ibd_data_splits": 10141151,
"pinnacle_ra_data_splits_idx": 10143574,
"pinnacle_ibd_data_splits_idx": 10143573,
"pinnacle_ra_drug_evidence": 10141153,
"pinnacle_ibd_drug_evidence": 10141154,
"opentargets_ra_data_splits": 10141152,
"opentargets_ibd_data_splits": 10141151,
"opentargets_ra_data_splits_idx": 10143574,
"opentargets_ibd_data_splits_idx": 10143573,
"opentargets_ra_drug_evidence": 10141153,
"opentargets_ibd_drug_evidence": 10141154,
}

oracle2type = {
Expand Down
1 change: 0 additions & 1 deletion tdc/multi_pred/proteinpeptide.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import sys

from ..utils import print_sys
from ..dataset_configs.pentelute_mdm2_ace2_12ca5_config import PenteluteProteinPeptideConfig
from . import bi_pred_dataset
from ..metadata import dataset_names
from ..dataset_configs.config_map import ConfigMap
Expand Down
4 changes: 2 additions & 2 deletions tdc/test/test_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,15 @@ def test_ADME_evaluate_many(self):
def test_SCDTI_benchmark(self):
from tdc.resource.dataloader import DataLoader

data = DataLoader(name="pinnacle_dti")
data = DataLoader(name="opentargets_dti")
group = scdti_group.SCDTIGroup()
train, val = group.get_train_valid_split()
assert len(val) == 0 # this benchmark has no validation set
# test simple preds
y_true = group.get_test()["Y"]
results = group.evaluate(y_true)
assert results[-1] == 1.0 # should be perfect F1 score
# assert it matches the PINNACLE official test scores
# assert it matches the opentargets official test scores
tst = data.get_split()["test"]["Y"]
results = group.evaluate(tst)
assert results[-1] == 1.0
Expand Down
8 changes: 4 additions & 4 deletions tdc/test/test_dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,13 @@ def test_cellxgene_list(self):
assert len(df) > 0
print(df.head())

def test_pentelute(self):
def test_brown(self):
# TODO: factor out into specialized test suites for individual datasets
# this test serves as an integration test of the data processing, data configs, and existing tdc pipeline. leave here for now.
from tdc.multi_pred import ProteinPeptide
data = ProteinPeptide(name="pentelute_mdm2_ace2_12ca5")
data = ProteinPeptide(name="brown_mdm2_ace2_12ca5")
assert "protein_or_rna_sequence" in data.get_data(
).columns # pentelute protein<>peptide dataset uses a data config inserting this column
).columns # brown protein<>peptide dataset uses a data config inserting this column
data.get_split()

@unittest.skip(
Expand All @@ -96,7 +96,7 @@ def test_resource_dataverse_dataloader(self):
import pandas as pd
from tdc.resource.dataloader import DataLoader

data = DataLoader(name="pinnacle_dti")
data = DataLoader(name="opentargets_dti")
df = data.get_data()
assert "Y" in df.columns
split = data.get_split()
Expand Down

0 comments on commit e115138

Please sign in to comment.