-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Building action space. 'Add datasets for various drug-target interaction tasks'
- Loading branch information
Showing
7 changed files
with
304 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
""" | ||
map with dataset labels and descriptions | ||
""" | ||
from .entity import Entity | ||
from ...resource.dataloader import DataLoader # for pinnacle scdti | ||
from ...multi_pred.dti import DTI # BindingDB | ||
|
||
import json | ||
|
||
class Dataset(Entity):
    """An action-space entity that describes one dataset and knows how to load it.

    Instances are dictionaries of dataset fields (``name``, ``task``,
    ``schema``, ``lambda_f``, ...) supplied as keyword arguments.
    """

    def __init__(self, **kwargs):
        """Store the dataset fields and register the class-level documentation."""
        super().__init__(**kwargs)
        # Human-readable meaning of every field a Dataset may carry.
        self._description = dict(
            name="the name of the dataset",
            description="a description of the dataset",
            task="The TDC task associated with the dataset",
            ml_task="The ML Task (i.e., Binary Classification, Regression, etc.) the dataset is created for",
            statistics="the statistics associated with the dataset",
            split="the available data splits for the dataset",
            schema="a mapping of column names to a dictionary containing the type of the column and a description for the column",
        )
        # Human-readable meaning of every callable a Dataset exposes.
        self._functions = dict(
            get="get the dataset as a pandas dataframe",
            get_task="get the TDC task associated with the dataset",
            get_schema="get a JSON for the dataset schema",
        )

    def get(self):
        """Invoke the zero-argument loader stored under ``lambda_f`` and return its result."""
        loader = self["lambda_f"]
        return loader()

    def get_task(self):
        """Return the value stored under the ``task`` key."""
        return self["task"]

    def get_schema(self):
        """Return the ``schema`` entry serialized as a 4-space-indented JSON string."""
        schema = self["schema"]
        return json.dumps(schema, indent=4)


# Every entity has a base instantiation so an agent can obtain the class
# description and functions without referring to a concrete dataset.
base = Dataset()
|
||
# Li, Michelle, et al. dataset entity | ||
# PINNACLE / Open Targets single-cell drug-target nomination dataset.
# The long texts use implicit string concatenation (not backslash continuations
# inside the literal) so their content never depends on source indentation and
# sentence boundaries keep their separating space.
pinnacle_opentargets = Dataset(
    name="Li, Michelle, et al.",
    task=(
        "scDTI: The goal is to train a model for predicting the probability that a protein is a candidate therapeutic target in a specific "
        "cell type. The model learns an estimator for a function of a protein target and a cell-type-specific biological context as input, and "
        "the model is tasked to predict the probability the candidate protein is a therapeutic target in that cell type."
    ),
    description=(
        "To curate target information for a therapeutic area, we examine the drugs indicated for the therapeutic area of interest and "
        "its descendants. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. Positive examples (i.e., "
        "where the label y = 1) are proteins targeted by drugs that have at least completed phase 2 of clinical trials for treating a specific "
        "therapeutic area. As such, a protein is a promising candidate if a compound that targets the protein is safe for humans and effective "
        "for treating the disease. We retain positive training examples activated in at least one cell type-specific protein interaction network. "
        "We define negative examples (i.e., where the label y = 0) as druggable proteins that do not have any known association with the "
        "therapeutic area of interest according to Open Targets. A protein is deemed druggable if targeted by at least one existing drug. "
        "We extract drugs and their nominal targets from Drugbank. We retain negative training examples activated in at least one cell "
        "type-specific protein interaction network."
    ),
    ml_task="Classification. Given the protein and cell-context, predict whether the protein is a therapeutic target.",
    statistics=(
        "The final number of positive (negative) samples for RA and IBD were 152 (1,465) and 114 (1,377), respectively. In PINNACLE, "
        "this dataset was augmented to include 156 cell types."
    ),
    # These literals are never %-formatted, so a single "%" is the correct spelling.
    split=(
        "Cold Protein Split. We split the dataset such that about 80% of the proteins are in the training set, about 10% of the proteins are in "
        "the validation set, and about 10% of the proteins are in the test set. The data splits are consistent for each cell type context to avoid data "
        "leakage."
    ),
    schema={
        "diseaseId": {
            "type": "string",
            "description": (
                "Disease ID. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. For "
                "rheumatoid arthritis, we collected therapeutic data (i.e., targets of drugs indicated for the therapeutic area) from OpenTargets "
                "for rheumatoid arthritis (EFO 0000685), ankylosing spondylitis (EFO 0003898), and psoriatic arthritis (EFO 0003778). "
                "For inflammatory bowel disease, we collected therapeutic data for ulcerative colitis (EFO 0000729), collagenous colitis "
                "(EFO 1001293), colitis (EFO 0003872), proctitis (EFO 0005628), Crohn’s colitis (EFO 0005622), lymphocytic colitis (EFO 1001294), "
                "Crohn’s disease (EFO 0000384), microscopic colitis (EFO 1001295), inflammatory bowel disease (EFO 0003767), "
                "appendicitis (EFO 0007149), ulcerative proctosigmoiditis (EFO 1001223), and small bowel Crohn’s disease (EFO 0005629)."
            ),
        },
        "targetId_genename": {
            "type": "string",
            "description": "Protein Target",
        },
        "Y": {
            "type": "int",
            "description": "Binary Indicator. Y=1 if the protein is a viable drug target for this disease and Y=0 otherwise.",
        },
    },
    # Deferred loader: nothing is loaded until Dataset.get() is called.
    lambda_f=lambda: DataLoader("opentargets_dti").get_data(),
)
|
||
# BindingDB, Kd measurements.
binding_db_kd = Dataset(
    name="BindingDB_Kd",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein "
        "considered to be drug-targets with small, drug-like molecules. This is the Kd dataset."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
    statistics="(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {"type": "string", "description": "Drug SMILES string"},
        "Target": {"type": "string", "description": "Protein Target Amino Acid Sequence"},
        "Y": {"type": "decimal", "description": "Binding Affinity"},
    },
    # Deferred loader: the DTI object is only built when Dataset.get() is called.
    lambda_f=lambda: DTI(name='BindingDB_Kd').harmonize_affinities(mode='mean').get_data(),
)
|
||
# BindingDB, IC50 measurements.
binding_db_ic50 = Dataset(
    name="BindingDB_IC50",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein "
        "considered to be drug-targets with small, drug-like molecules. This is the IC50 dataset."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
    statistics="(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {"type": "string", "description": "Drug SMILES string"},
        "Target": {"type": "string", "description": "Protein Target Amino Acid Sequence"},
        "Y": {"type": "decimal", "description": "Binding Affinity"},
    },
    # Deferred loader: the DTI object is only built when Dataset.get() is called.
    lambda_f=lambda: DTI(name='BindingDB_IC50').harmonize_affinities(mode='mean').get_data(),
)
|
||
# BindingDB, Ki measurements.
binding_db_ki = Dataset(
    name="BindingDB_Ki",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein "
        "considered to be drug-targets with small, drug-like molecules. This is the Ki dataset."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
    statistics="(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {"type": "string", "description": "Drug SMILES string"},
        "Target": {"type": "string", "description": "Protein Target Amino Acid Sequence"},
        "Y": {"type": "decimal", "description": "Binding Affinity"},
    },
    # Deferred loader: the DTI object is only built when Dataset.get() is called.
    lambda_f=lambda: DTI(name='BindingDB_Ki').harmonize_affinities(mode='mean').get_data(),
)
|
||
# DAVIS kinase inhibitor dataset.
davis = Dataset(
    name="Davis",
    # Implicit concatenation keeps the text independent of source indentation.
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    # This literal is never %-formatted, so a single "%" is the correct spelling.
    description="The interaction of 72 kinase inhibitors with 442 kinases covering >80% of the human catalytic protein kinome.",
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity",
    statistics="25,772 DTI pairs, 68 drugs, 379 proteins",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {
            "type": "string",
            "description": "Drug SMILES string",
        },
        "Target": {
            "type": "string",
            "description": "Protein Target Amino Acid Sequence",
        },
        "Y": {
            "type": "decimal",
            "description": "Binding Affinity",
        },
    },
    # Deferred loader: the DTI object is only built when Dataset.get() is called.
    lambda_f=lambda: DTI(name='DAVIS').get_data(),
)
|
||
# KIBA integrated bioactivity dataset.
# Long texts use implicit string concatenation with explicit separating spaces;
# a backslash continuation inside the literal would join "Tang" directly to
# whatever starts the next source line.
kiba = Dataset(
    name="KIBA",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "Toward making use of the complementary information captured by the various bioactivity types, including IC50, K(i), and K(d), Tang "
        "et al. introduces a model-based integration approach, termed KIBA to generate an integrated drug-target bioactivity matrix."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity",
    statistics="117,657 DTI pairs, 2,068 drugs, 229 proteins.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {
            "type": "string",
            "description": "Drug SMILES string",
        },
        "Target": {
            "type": "string",
            "description": "Protein Target Amino Acid Sequence",
        },
        "Y": {
            "type": "decimal",
            "description": "Binding Affinity",
        },
    },
    # Deferred loader: the DTI object is only built when Dataset.get() is called.
    lambda_f=lambda: DTI(name='KIBA').get_data(),
)
|
||
# TODO: datasets for PPI, GDA, DrugRes |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
""" | ||
Base class for the action space. An entity. | ||
""" | ||
|
||
import json | ||
|
||
class Entity(dict):
    """Base class for action-space entities: a dict of fields with JSON helpers.

    Subclasses are expected to set ``_description`` and ``_functions`` (both
    JSON-serializable objects) in their ``__init__``; the base class leaves
    them as ``None``.
    """

    # Keys whose values are not JSON-encodable; ``get_json`` leaves them out.
    _UNSERIALIZABLE = [
        "lambda_f"
    ]

    def __init__(self, **kwargs):
        """Store ``kwargs`` as the entity's fields."""
        super(Entity, self).__init__(**kwargs)
        self._description = None
        self._functions = None

    def get_json(self):
        """Return the entity's fields as a 4-space-indented JSON string.

        Keys listed in ``_UNSERIALIZABLE`` are omitted. The entity itself is
        not modified.
        """
        # Filter with a comprehension rather than ``dict(**self)``: keyword
        # unpacking raises TypeError as soon as the entity holds a non-string
        # key, whereas ``json.dumps`` can encode int/float/bool/None keys.
        serializable = {
            key: value
            for key, value in self.items()
            if key not in self._UNSERIALIZABLE
        }
        return json.dumps(serializable, indent=4)

    @property
    def entity_class_description(self):
        """JSON string describing the fields of this entity class.

        Raises:
            AssertionError: if the subclass never set ``_description``.
        """
        # Raised explicitly (instead of an ``assert`` statement) so the check
        # still runs under ``python -O``; the exception type is unchanged.
        if self._description is None:
            raise AssertionError("Entity description is None for {}".format(type(self)))
        return json.dumps(self._description, indent=4)

    @property
    def entity_class_functions(self):
        """JSON string describing the functions of this entity class.

        Raises:
            AssertionError: if the subclass never set ``_functions``.
        """
        if self._functions is None:
            raise AssertionError("Entity functions is None for {}".format(type(self)))
        return json.dumps(self._functions, indent=4)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
""" | ||
An object for defining an action in the action space | ||
""" | ||
import json | ||
|
||
class Tool(dict):
    """An action in the action space, stored as a dict of its fields.

    A tool must be constructed with ``name``, ``type``, ``description``,
    ``functions`` and ``instructions`` keyword arguments.
    """

    # (required key, error-message template) pairs, checked in this order.
    _REQUIRED = (
        ("name", "this tool needs a name. read params {}"),
        ("type", "this tool needs a type. read params {}"),
        ("description", "this tool needs a description. read params {}"),
        ("functions", "this tool needs a functions set. read params {}"),
        ("instructions", "this tools needs an instruction tuning dataset. read params {}"),
    )

    def __init__(self, **kwargs):
        """Store ``kwargs`` and verify that every required field is present.

        Raises:
            AssertionError: if a required field is missing; the message names
                the first missing field and echoes the received parameters.
        """
        super(Tool, self).__init__(**kwargs)
        for key, template in self._REQUIRED:
            if key not in self:
                # Raised explicitly (instead of an ``assert`` statement) so the
                # validation still runs under ``python -O``.
                raise AssertionError(template.format(self._describe_params()))

    def _describe_params(self):
        """Render the received parameters for an error message.

        Only called when validation fails, so a tool holding values that are
        not JSON-encodable (e.g. callables) can still be constructed.
        """
        try:
            return json.dumps(self, indent=4)
        except (TypeError, ValueError):
            # Not JSON-encodable (or circular): fall back to the plain repr.
            return repr(dict(self))

    def get_json(self):
        """Return the tool's fields as a 4-space-indented JSON string."""
        return json.dumps(self, indent=4)
|
||
if __name__ == "__main__":
    # Smoke test: a Tool built without its required fields must be rejected.
    # The rejection is caught and reported so the script runs to completion
    # instead of dying with a traceback before "done!" is printed.
    try:
        Tool(tst=1, ntstx=2)
    except AssertionError as err:
        print("rejected as expected: {}".format(err))
    print("done!")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
import sys | ||
|
||
sys.path.append( | ||
os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))) | ||
import unittest | ||
import shutil | ||
|
||
from pandas import DataFrame | ||
from tdc.agents.action_spaces import datasets | ||
|
||
class TestDatasets(unittest.TestCase):
    """Smoke tests for the dataset entities of the agent action space."""

    def setUp(self):
        """Print the working directory to make data-download locations traceable."""
        print(os.getcwd())

    def test_pinnacle(self):
        """The PINNACLE dataset exposes a JSON schema and loads as a DataFrame."""
        import json  # local import: only this test needs to parse JSON

        dataset = datasets.pinnacle_opentargets

        # get_schema() returns JSON *text*, so parse it before checking the
        # structure; comparing the raw return value's type to dict can never pass.
        schema = json.loads(dataset.get_schema())
        assert isinstance(schema, dict), type(schema)
        assert "Y" in schema

        # check dataframe
        df = dataset.get()
        assert isinstance(df, DataFrame)
        assert "Y" in df.columns
        assert "targetId_genename" in df.columns
        print(df.head())
        print(dataset.get_json())

    def tearDown(self):
        """Best-effort removal of the ``data`` directory under the working directory."""
        print(os.getcwd())
        # ignore_errors keeps the cleanup best-effort (e.g. the directory was
        # never created) without a bare ``except`` that would also swallow
        # KeyboardInterrupt and genuine bugs.
        shutil.rmtree(os.path.join(os.getcwd(), "data"), ignore_errors=True)