class Dataset(Entity):
    """Entity describing one dataset exposed to an agent.

    The keyword arguments given to the constructor are stored as dictionary
    items by the parent class. The accessors below read the ``"lambda_f"``,
    ``"task"`` and ``"schema"`` items, so an instance built without them
    raises ``KeyError`` from the corresponding accessor.
    """

    def __init__(self, **kwargs):
        """Store ``kwargs`` as items and attach the class-level metadata."""
        super().__init__(**kwargs)
        # Human-readable explanation of every item a dataset entity carries.
        self._description = dict(
            name="the name of the dataset",
            description="a description of the dataset",
            task="The TDC task associated with the dataset",
            ml_task="The ML Task (i.e., Binary Classification, Regression, etc.) the dataset is created for",
            statistics="the statistics associated with the dataset",
            split="the available data splits for the dataset",
            schema="a mapping of column names to a dictionary containing the type of the column and a description for the column",
        )
        # Human-readable explanation of every accessor defined on this class.
        self._functions = dict(
            get="get the dataset as a pandas dataframe",
            get_task="get the TDC task associated with the dataset",
            get_schema="get a JSON for the dataset schema",
        )

    def get(self):
        """Call the zero-argument loader stored under ``"lambda_f"`` and return its result."""
        loader = self["lambda_f"]
        return loader()

    def get_task(self):
        """Return the value stored under ``"task"``."""
        task = self["task"]
        return task

    def get_schema(self):
        """Return the ``"schema"`` item serialized as a JSON string (4-space indent)."""
        schema = self["schema"]
        return json.dumps(schema, indent=4)
The model learns an estimator for a function of a protein target and a cell-type-specific biological context as input, and \ + the model is tasked to predict the probability the candidate protein is a therapeutic target in that cell type.", + description = "To curate target information for a therapeutic area, we examine the drugs indicated for the therapeutic area of interest and\ + its descendants. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. Positive examples (i.e.,\ + where the label y = 1) are proteins targeted by drugs that have at least completed phase 2 of clinical trials for treating a specific \ + therapeutic area. As such, a protein is a promising candidate if a compound that targets the protein is safe for humans and effective \ + for treating the disease. We retain positive training examples activated in at least one cell type-specific protein interaction network.\ + We define negative examples (i.e., where the label y = 0) as druggable proteins that do not have any known association with the \ + therapeutic area of interest according to Open Targets. A protein is deemed druggable if targeted by at least one existing drug.\ + We extract drugs and their nominal targets from Drugbank. We retain negative training examples activated in at least one cell\ + type-specific protein interaction network.", + ml_task = "Classification. Given the protein and cell-context, predict whether the protein is a therapeutic target.", + statistics = "The final number of positive (negative) samples for RA and IBD were 152 (1,465) and 114 (1,377), respectively. In PINNACLE, \ + this dataset was augmented to include 156 cell types.", + split = "Cold Protein Split. We split the dataset such that about 80%% of the proteins are in the training set, about 10%% of the proteins are in\ + the validation set, and about 10%% of the proteins are in the test set. 
The data splits are consistent for each cell type context to avoid data\ + leakage.", + schema = { + "diseaseId": { + "type": "string", + "description": "Disease ID. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. For \ + rheumatoid arthritis, we collected therapeutic data (i.e., targets of drugs indicated for the therapeutic area) from OpenTargets \ + for rheumatoid arthritis (EFO 0000685), ankylosing spondylitis (EFO 0003898), and psoriatic arthritis (EFO 0003778). \ + For inflammatory bowel disease, we collected therapeutic data for ulcerative colitis (EFO 0000729), collagenous colitis \ + (EFO 1001293), colitis (EFO 0003872), proctitis (EFO 0005628), Crohn’s colitis (EFO 0005622), lymphocytic colitis (EFO 1001294),\ + Crohn’s disease (EFO 0000384), microscopic colitis (EFO 1001295), inflammatory bowel disease (EFO 0003767), \ + appendicitis (EFO 0007149), ulcerative proctosigmoiditis (EFO 1001223), and small bowel Crohn’s disease (EFO 0005629)." + }, + "targetId_genename": { + "type": "string", + "description": "Protein Target" + }, + "Y": { + "type": "int", + "description": "Binary Indicator. Y=1 if the protein is a viable drug target for this disease and Y=0 otherwise." + } + }, + lambda_f = lambda: DataLoader("opentargets_dti").get_data() +) + +binding_db_kd = Dataset( + name = "BindingDB_Kd", + task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \ + structural information and protein amino acid sequence.", + description = "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein \ + considered to be drug-targets with small, drug-like molecules. This is the Kd dataset.", + ml_task = "Regression. 
Given the target amino acid sequence/compound SMILES string, predict their binding affinity.", + statistics = "(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.", + split = "Random Split, Cold Drug Split, or Cold Protein Split", + schema = { + "Drug": { + "type": "string", + "description": "Drug SMILES string" + }, + "Target": { + "type": "string", + "description": "Protein Target Amino Acid Sequence" + }, + "Y": { + "type": "decimal", + "description": "Binding Affinity" + } + }, + lambda_f = lambda: DTI(name = 'BindingDB_Kd').harmonize_affinities(mode = 'mean').get_data() +) + +binding_db_ic50 = Dataset( + name = "BindingDB_IC50", + task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \ + structural information and protein amino acid sequence.", + description = "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein \ + considered to be drug-targets with small, drug-like molecules. This is the IC50 dataset.", + ml_task = "Regression. 
Given the target amino acid sequence/compound SMILES string, predict their binding affinity.", + statistics = "(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.", + split = "Random Split, Cold Drug Split, or Cold Protein Split", + schema = { + "Drug": { + "type": "string", + "description": "Drug SMILES string" + }, + "Target": { + "type": "string", + "description": "Protein Target Amino Acid Sequence" + }, + "Y": { + "type": "decimal", + "description": "Binding Affinity" + } + }, + lambda_f = lambda: DTI(name = 'BindingDB_IC50').harmonize_affinities(mode = 'mean').get_data() +) + +binding_db_ki = Dataset( + name = "BindingDB_Ki", + task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \ + structural information and protein amino acid sequence.", + description = "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein \ + considered to be drug-targets with small, drug-like molecules. This is the Ki dataset.", + ml_task = "Regression. 
# Davis dataset entity.
# The long texts are built from adjacent string literals rather than
# backslash continuations inside one literal, so the indentation of the
# source file does not end up inside the text. The percent sign is written
# once: nothing in this module %-formats these strings, so "%%" would be
# emitted literally.
davis = Dataset(
    name="Davis",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the "
        "interaction activity score in silico given only the accessible "
        "compound structural information and protein amino acid sequence."
    ),
    description=(
        "The interaction of 72 kinase inhibitors with 442 kinases covering "
        ">80% of the human catalytic protein kinome."
    ),
    ml_task=(
        "Regression. Given the target amino acid sequence/compound SMILES "
        "string, predict their binding affinity"
    ),
    statistics="25,772 DTI pairs, 68 drugs, 379 proteins",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {
            "type": "string",
            "description": "Drug SMILES string",
        },
        "Target": {
            "type": "string",
            "description": "Protein Target Amino Acid Sequence",
        },
        "Y": {
            "type": "decimal",
            "description": "Binding Affinity",
        },
    },
    # Deferred so that importing this module does not load the data.
    lambda_f=lambda: DTI(name='DAVIS').get_data(),
)
class Entity(dict):
    """Dictionary-backed base class for members of the action space.

    Constructor keyword arguments become dictionary items. Subclasses are
    expected to fill ``_description`` and ``_functions``; both start as
    ``None`` and the matching properties assert that they have been set.
    """

    # Item keys that are dropped before JSON serialization.
    _UNSERIALIZABLE = [
        "lambda_f"
    ]

    def __init__(self, **kwargs):
        """Store ``kwargs`` as items and reset the descriptive metadata."""
        super(Entity, self).__init__(**kwargs)
        self._functions = None
        self._description = None

    def get_json(self):
        """Return the items as a JSON string, without the unserializable keys.

        The entity itself is left untouched; filtering happens on a copy.
        """
        payload = dict(**self)
        for key in self._UNSERIALIZABLE:
            # pop with a default is a no-op when the key is absent
            payload.pop(key, None)
        return json.dumps(payload, indent=4)

    @property
    def entity_class_description(self):
        """JSON string of ``_description``; asserts that it has been set."""
        description = self._description
        assert description is not None, "Entity description is None for {}".format(type(self))
        return json.dumps(description, indent=4)

    @property
    def entity_class_functions(self):
        """JSON string of ``_functions``; asserts that it has been set."""
        functions = self._functions
        assert functions is not None, "Entity functions is None for {}".format(type(self))
        return json.dumps(functions, indent=4)
class TestDatasets(unittest.TestCase):
    """Checks for the dataset entities in ``tdc.agents.action_spaces.datasets``."""

    def setUp(self):
        """Print the working directory the test runs in."""
        print(os.getcwd())

    def test_pinnacle(self):
        """The PINNACLE dataset entity exposes a JSON schema and a dataframe."""
        # Local import: json is not imported at the top of this test module.
        import json

        dataset = datasets.pinnacle_opentargets

        # get_schema() returns a JSON *string*, so decode it before checking
        # the structure; comparing its type to dict can never succeed.
        schema_json = dataset.get_schema()
        self.assertIsInstance(schema_json, str)
        schema = json.loads(schema_json)
        self.assertIsInstance(schema, dict)
        self.assertIn("Y", schema)

        # check dataframe
        df = dataset.get()
        self.assertIsInstance(df, DataFrame)
        self.assertIn("Y", df.columns)
        self.assertIn("targetId_genename", df.columns)
        print(df.head())
        print(dataset.get_json())

    def tearDown(self):
        """Best-effort removal of the ``data`` directory under the working directory."""
        try:
            cwd = os.getcwd()
        except OSError:
            # The working directory is gone; there is nothing to clean up.
            return
        print(cwd)
        # ignore_errors keeps the cleanup best-effort without a bare except.
        shutil.rmtree(os.path.join(cwd, "data"), ignore_errors=True)