Building action space. 'Add datasets for various drug-target interaction tasks'
mims-harvard · Jul 11, 2024 · ba049bc · ba049bc
1 parent 923fad4
commit ba049bc
Show file tree

Hide file tree

Showing 7 changed files with 304 additions and 0 deletions.
diff --git a/tdc/agents/__init__.py b/tdc/agents/__init__.py
diff --git a/tdc/agents/action_spaces/__init__.py b/tdc/agents/action_spaces/__init__.py
diff --git a/tdc/agents/action_spaces/datasets.py b/tdc/agents/action_spaces/datasets.py
@@ -0,0 +1,213 @@
+"""
+map with dataset labels and descriptions
+"""
+from .entity import Entity
+from ...resource.dataloader import DataLoader # for pinnacle scdti
+from ...multi_pred.dti import DTI # BindingDB
+
+import json
+
+class Dataset(Entity):
+    """Entity describing one dataset in the agent action space.
+
+    Field values are supplied as keyword arguments and stored as dictionary
+    items. The "lambda_f" item, when present, is a zero-argument callable
+    that produces the data.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Class-level documentation of the fields a Dataset carries.
+        self._description = dict(
+            name="the name of the dataset",
+            description="a description of the dataset",
+            task="The TDC task associated with the dataset",
+            ml_task="The ML Task (i.e., Binary Classification, Regression, etc.) the dataset is created for",
+            statistics="the statistics associated with the dataset",
+            split="the available data splits for the dataset",
+            schema="a mapping of column names to a dictionary containing the type of the column and a description for the column",
+        )
+        # Class-level documentation of the callable actions on a Dataset.
+        self._functions = dict(
+            get="get the dataset as a pandas dataframe",
+            get_task="get the TDC task associated with the dataset",
+            get_schema="get a JSON for the dataset schema",
+        )
+
+    def get(self):
+        """Call the stored "lambda_f" loader and return whatever it returns.
+
+        Raises KeyError when the entity was built without "lambda_f".
+        """
+        loader = self["lambda_f"]
+        return loader()
+
+    def get_task(self):
+        """Return the "task" item of this entity."""
+        task = self["task"]
+        return task
+
+    def get_schema(self):
+        """Return the "schema" item serialized as an indented JSON string."""
+        schema = self["schema"]
+        return json.dumps(schema, indent=4)
+
+# Field-less Dataset: every entity has a base instantiation so an agent can read the
+# class-level description and functions without choosing a concrete dataset.
+base = Dataset()
+
+# Li, Michelle, et al. dataset entity (scDTI task, PINNACLE / Open Targets).
+# The text fields below use backslash line continuations inside the string literals, so
+# the leading indentation of each continued line is part of the resulting string value.
+# NOTE(review): the "%%" sequences in `split` stay as two literal percent characters
+# unless a consumer applies %-formatting to the text - confirm which is intended.
+pinnacle_opentargets = Dataset(
+    name = "Li, Michelle, et al.",
+    task = "scDTI: The goal is to train a model for predicting the probability that a protein is a candidate therapeutic target in a specific \
+        cell type. The model learns an estimator for a function of a protein target and a cell-type-specific biological context as input, and \
+            the model is tasked to predict the probability the candidate protein is a therapeutic target in that cell type.",
+    description = "To curate target information for a therapeutic area, we examine the drugs indicated for the therapeutic area of interest and\
+        its descendants. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. Positive examples (i.e.,\
+            where the label y = 1) are proteins targeted by drugs that have at least completed phase 2 of clinical trials for treating a specific \
+                therapeutic area. As such, a protein is a promising candidate if a compound that targets the protein is safe for humans and effective \
+                    for treating the disease. We retain positive training examples activated in at least one cell type-specific protein interaction network.\
+                        We define negative examples (i.e., where the label y = 0) as druggable proteins that do not have any known association with the \
+                            therapeutic area of interest according to Open Targets. A protein is deemed druggable if targeted by at least one existing drug.\
+                                We extract drugs and their nominal targets from Drugbank. We retain negative training examples activated in at least one cell\
+                                    type-specific protein interaction network.",
+    ml_task = "Classification. Given the protein and cell-context, predict whether the protein is a therapeutic target.",
+    statistics = "The final number of positive (negative) samples for RA and IBD were 152 (1,465) and 114 (1,377), respectively. In PINNACLE, \
+        this dataset was augmented to include 156 cell types.",
+    split = "Cold Protein Split. We split the dataset such that about 80%% of the proteins are in the training set, about 10%% of the proteins are in\
+        the validation set, and about 10%% of the proteins are in the test set. The data splits are consistent for each cell type context to avoid data\
+            leakage.",
+    # Column name -> {"type", "description"}; serialized by Dataset.get_schema().
+    schema = {
+        "diseaseId": {
+            "type": "string",
+            "description": "Disease ID. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. For \
+                rheumatoid arthritis, we collected therapeutic data (i.e., targets of drugs indicated for the therapeutic area) from OpenTargets \
+                    for rheumatoid arthritis (EFO 0000685), ankylosing spondylitis (EFO 0003898), and psoriatic arthritis (EFO 0003778). \
+                        For inflammatory bowel disease, we collected therapeutic data for ulcerative colitis (EFO 0000729), collagenous colitis \
+                            (EFO 1001293), colitis (EFO 0003872), proctitis (EFO 0005628), Crohn’s colitis (EFO 0005622), lymphocytic colitis (EFO 1001294),\
+                                Crohn’s disease (EFO 0000384), microscopic colitis (EFO 1001295), inflammatory bowel disease (EFO 0003767), \
+                                    appendicitis (EFO 0007149), ulcerative proctosigmoiditis (EFO 1001223), and small bowel Crohn’s disease (EFO 0005629)."
+        },
+        "targetId_genename": {
+            "type": "string",
+            "description": "Protein Target"
+        },
+        "Y": {
+            "type": "int",
+            "description": "Binary Indicator. Y=1 if the protein is a viable drug target for this disease and Y=0 otherwise."
+        }
+    },
+    # Deferred loader: only runs when Dataset.get() calls it, not at import time.
+    lambda_f = lambda: DataLoader("opentargets_dti").get_data()
+)
+
+# BindingDB Kd dataset entity (DTI regression task).
+# The `task` and `description` strings use backslash continuations, so the indentation of
+# each continued line is embedded in the string value.
+# NOTE(review): `statistics` lists the counts for Kd, IC50 and Ki together and is the same
+# string in all three BindingDB entities - confirm that is intended.
+binding_db_kd = Dataset(
+    name = "BindingDB_Kd",
+    task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \
+        structural information and protein amino acid sequence.",
+    description = "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein \
+        considered to be drug-targets with small, drug-like molecules. This is the Kd dataset.",
+    ml_task = "Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
+    statistics = "(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
+    split = "Random Split, Cold Drug Split, or Cold Protein Split",
+    # Column name -> {"type", "description"}; serialized by Dataset.get_schema().
+    schema = {
+        "Drug": {
+            "type": "string",
+            "description": "Drug SMILES string"
+        },
+        "Target": {
+            "type": "string",
+            "description": "Protein Target Amino Acid Sequence"
+        },
+        "Y": {
+            "type": "decimal",
+            "description": "Binding Affinity"
+        }
+    },
+    # Deferred loader: builds the DTI object, calls harmonize_affinities(mode='mean') and
+    # then get_data() on its result; only runs when Dataset.get() calls it.
+    lambda_f = lambda: DTI(name = 'BindingDB_Kd').harmonize_affinities(mode = 'mean').get_data()
+)
+
+# BindingDB IC50 dataset entity (DTI regression task).
+# The `task` and `description` strings use backslash continuations, so the indentation of
+# each continued line is embedded in the string value.
+# NOTE(review): `statistics` lists the counts for Kd, IC50 and Ki together and is the same
+# string in all three BindingDB entities - confirm that is intended.
+binding_db_ic50 = Dataset(
+    name = "BindingDB_IC50",
+    task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \
+        structural information and protein amino acid sequence.",
+    description = "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein \
+        considered to be drug-targets with small, drug-like molecules. This is the IC50 dataset.",
+    ml_task = "Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
+    statistics = "(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
+    split = "Random Split, Cold Drug Split, or Cold Protein Split",
+    # Column name -> {"type", "description"}; serialized by Dataset.get_schema().
+    schema = {
+        "Drug": {
+            "type": "string",
+            "description": "Drug SMILES string"
+        },
+        "Target": {
+            "type": "string",
+            "description": "Protein Target Amino Acid Sequence"
+        },
+        "Y": {
+            "type": "decimal",
+            "description": "Binding Affinity"
+        }
+    },
+    # Deferred loader: builds the DTI object, calls harmonize_affinities(mode='mean') and
+    # then get_data() on its result; only runs when Dataset.get() calls it.
+    lambda_f = lambda: DTI(name = 'BindingDB_IC50').harmonize_affinities(mode = 'mean').get_data()
+)
+
+# BindingDB Ki dataset entity (DTI regression task).
+# The `task` and `description` strings use backslash continuations, so the indentation of
+# each continued line is embedded in the string value.
+# NOTE(review): `statistics` lists the counts for Kd, IC50 and Ki together and is the same
+# string in all three BindingDB entities - confirm that is intended.
+binding_db_ki = Dataset(
+    name = "BindingDB_Ki",
+    task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \
+        structural information and protein amino acid sequence.",
+    description = "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein \
+        considered to be drug-targets with small, drug-like molecules. This is the Ki dataset.",
+    ml_task = "Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
+    statistics = "(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
+    split = "Random Split, Cold Drug Split, or Cold Protein Split",
+    # Column name -> {"type", "description"}; serialized by Dataset.get_schema().
+    schema = {
+        "Drug": {
+            "type": "string",
+            "description": "Drug SMILES string"
+        },
+        "Target": {
+            "type": "string",
+            "description": "Protein Target Amino Acid Sequence"
+        },
+        "Y": {
+            "type": "decimal",
+            "description": "Binding Affinity"
+        }
+    },
+    # Deferred loader: builds the DTI object, calls harmonize_affinities(mode='mean') and
+    # then get_data() on its result; only runs when Dataset.get() calls it.
+    lambda_f = lambda: DTI(name = 'BindingDB_Ki').harmonize_affinities(mode = 'mean').get_data()
+)
+
+# Davis dataset entity (DTI regression task).
+# The `task` string uses a backslash continuation, so the indentation of the continued
+# line is embedded in the string value.
+# NOTE(review): `description` mentions 72 kinase inhibitors and 442 kinases while
+# `statistics` reports 68 drugs and 379 proteins - confirm both figures.
+# NOTE(review): the "%%" in `description` stays as two literal percent characters unless a
+# consumer applies %-formatting to the text - confirm which is intended.
+davis = Dataset(
+    name = "Davis",
+    task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \
+        structural information and protein amino acid sequence.",
+    description = "The interaction of 72 kinase inhibitors with 442 kinases covering >80%% of the human catalytic protein kinome.",
+    ml_task = "Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity",
+    statistics = "25,772 DTI pairs, 68 drugs, 379 proteins",
+    split = "Random Split, Cold Drug Split, or Cold Protein Split",
+    # Column name -> {"type", "description"}; serialized by Dataset.get_schema().
+    schema = {
+        "Drug": {
+            "type": "string",
+            "description": "Drug SMILES string"
+        },
+        "Target": {
+            "type": "string",
+            "description": "Protein Target Amino Acid Sequence"
+        },
+        "Y": {
+            "type": "decimal",
+            "description": "Binding Affinity"
+        }
+    },
+    # Deferred loader: only runs when Dataset.get() calls it. The loader key is upper-case
+    # 'DAVIS' while the entity name above is "Davis".
+    lambda_f = lambda: DTI(name = 'DAVIS').get_data()
+)
+
+# KIBA dataset entity (DTI regression task).
+# The `task` and `description` strings use backslash continuations, so the indentation of
+# each continued line is embedded in the string value.
+kiba = Dataset(
+    name = "KIBA",
+    task = "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound \
+        structural information and protein amino acid sequence.",
+    description = "Toward making use of the complementary information captured by the various bioactivity types, including IC50, K(i), and K(d), Tang\
+        et al. introduces a model-based integration approach, termed KIBA to generate an integrated drug-target bioactivity matrix.",
+    ml_task = "Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity",
+    statistics = "117,657 DTI pairs, 2,068 drugs, 229 proteins.",
+    split = "Random Split, Cold Drug Split, or Cold Protein Split",
+    # Column name -> {"type", "description"}; serialized by Dataset.get_schema().
+    schema = {
+        "Drug": {
+            "type": "string",
+            "description": "Drug SMILES string"
+            },
+        "Target": {
+            "type": "string",
+            "description": "Protein Target Amino Acid Sequence"
+            },
+        "Y": {
+            "type": "decimal",
+            "description": "Binding Affinity"
+            }
+        },
+    # Deferred loader: only runs when Dataset.get() calls it, not at import time.
+    lambda_f = lambda: DTI(name = 'KIBA').get_data()
+)
+
+# TODO: datasets for PPI, GDA, DrugRes
diff --git a/tdc/agents/action_spaces/entity.py b/tdc/agents/action_spaces/entity.py
@@ -0,0 +1,33 @@
+"""
+Base class for the action space. An entity.
+"""
+
+import json
+
+class Entity(dict):
+    """Dictionary-backed base class for items in the action space.
+
+    Keyword arguments become dictionary items. Subclasses are expected to set
+    ``_description`` and ``_functions``; until they do, the two
+    ``entity_class_*`` properties fail their assertion.
+    """
+
+    # Item keys that are dropped before JSON serialization in get_json().
+    _UNSERIALIZABLE = [
+        "lambda_f"
+    ]
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._functions = None
+        self._description = None
+
+    def get_json(self):
+        """Return the items as indented JSON, without the unserializable keys."""
+        payload = dict(**self)
+        for key in self._UNSERIALIZABLE:
+            # Absent keys are simply skipped.
+            payload.pop(key, None)
+        return json.dumps(payload, indent=4)
+
+    @property
+    def entity_class_description(self):
+        """Indented JSON of the field descriptions set by the subclass."""
+        description = self._description
+        assert description is not None, "Entity description is None for {}".format(type(self))
+        return json.dumps(description, indent=4)
+
+    @property
+    def entity_class_functions(self):
+        """Indented JSON of the function descriptions set by the subclass."""
+        functions = self._functions
+        assert functions is not None, "Entity functions is None for {}".format(type(self))
+        return json.dumps(functions, indent=4)
diff --git a/tdc/agents/action_spaces/prism.py b/tdc/agents/action_spaces/prism.py
diff --git a/tdc/agents/action_spaces/tool.py b/tdc/agents/action_spaces/tool.py
@@ -0,0 +1,22 @@
+"""
+An object for defining an action in the action space
+"""
+import json
+
+class Tool(dict):
+    """Dictionary describing one action (tool) in the action space.
+
+    Keyword arguments become dictionary items. Construction fails with
+    AssertionError unless all of "name", "type", "description", "functions"
+    and "instructions" are supplied.
+    """
+
+    # Required item keys, in the order they are checked, with the problem text
+    # reported when the key is missing.
+    _REQUIRED_KEYS = (
+        ("name", "this tool needs a name"),
+        ("type", "this tool needs a type"),
+        ("description", "this tool needs a description"),
+        ("functions", "this tool needs a functions set"),
+        ("instructions", "this tool needs an instruction tuning dataset"),
+    )
+
+    def __init__(self, **kwargs):
+        """Store ``kwargs`` as items and validate the required keys.
+
+        Raises:
+            AssertionError: if a required key is missing. It is raised
+                explicitly so the check still runs under ``python -O``.
+        """
+        super(Tool, self).__init__(**kwargs)
+        for key, problem in self._REQUIRED_KEYS:
+            if key in self:
+                continue
+            # Render the parameters only when reporting a failure, and fall
+            # back to repr() for values json cannot encode (sets, callables),
+            # so a bad value never hides the real validation error.
+            inputs = json.dumps(self, indent=4, default=repr)
+            raise AssertionError("{}. read params {}".format(problem, inputs))
+
+    def get_json(self):
+        """Return the tool's items as an indented JSON string."""
+        return json.dumps(self, indent=4)
+
+if __name__ == "__main__":
+    # Smoke check: a Tool built without the required keys must be rejected.
+    # The failure is caught and shown so the script can run to completion.
+    try:
+        Tool(tst=1, ntstx=2)
+    except AssertionError as err:
+        print("rejected as expected: {}".format(err))
+    print("done!")
diff --git a/tdc/test/test_agents.py b/tdc/test/test_agents.py
@@ -0,0 +1,36 @@
+import os
+import sys
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+import unittest
+import shutil
+
+from pandas import DataFrame
+from tdc.agents.action_spaces import datasets
+
+class TestDatasets(unittest.TestCase):
+    """Smoke tests for the dataset entities of the agents action space."""
+
+    def setUp(self):
+        # Log the working directory; tearDown removes the "data" folder under it.
+        print(os.getcwd())
+
+    def test_pinnacle(self):
+        """Check the schema and the loaded dataframe of pinnacle_opentargets."""
+        import json  # local import: this module's header does not import json
+
+        dataset = datasets.pinnacle_opentargets
+        schema_json = dataset.get_schema()
+        # get_schema() returns the output of json.dumps, i.e. a str rather
+        # than a dict, so parse it before looking up the column name.
+        self.assertIsInstance(schema_json, str)
+        self.assertIn("Y", json.loads(schema_json))
+
+        # check dataframe
+        df = dataset.get()
+        self.assertIsInstance(df, DataFrame)
+        self.assertIn("Y", df.columns)
+        self.assertIn("targetId_genename", df.columns)
+        print(df.head())
+        print(dataset.get_json())
+
+    def tearDown(self):
+        print(os.getcwd())
+        # Best-effort cleanup: a missing or undeletable "data" folder is
+        # ignored without resorting to a bare except.
+        shutil.rmtree(os.path.join(os.getcwd(), "data"), ignore_errors=True)