Skip to content

Commit

Permalink
Building action space. 'Add datasets for various drug-target interact…
Browse files Browse the repository at this point in the history
…ion tasks'
  • Loading branch information
amva13 committed Jul 11, 2024
1 parent 923fad4 commit ba049bc
Show file tree
Hide file tree
Showing 7 changed files with 304 additions and 0 deletions.
Empty file added tdc/agents/__init__.py
Empty file.
Empty file.
213 changes: 213 additions & 0 deletions tdc/agents/action_spaces/datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
"""
Dataset entities for the agent action space: each entry pairs a dataset label with its description, task, schema and loader.
"""
from .entity import Entity
from ...resource.dataloader import DataLoader # for pinnacle scdti
from ...multi_pred.dti import DTI # BindingDB

import json

class Dataset(Entity):
    """Entity describing one dataset available in the agent action space.

    Keyword arguments are forwarded to ``Entity``. The accessors below read
    the ``lambda_f``, ``task`` and ``schema`` items of the instance.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Meaning of every metadata field a dataset entity is described by.
        self._description = dict(
            name="the name of the dataset",
            description="a description of the dataset",
            task="The TDC task associated with the dataset",
            ml_task="The ML Task (i.e., Binary Classification, Regression, etc.) the dataset is created for",
            statistics="the statistics associated with the dataset",
            split="the available data splits for the dataset",
            schema="a mapping of column names to a dictionary containing the type of the column and a description for the column",
        )
        # Meaning of every function a dataset entity exposes.
        self._functions = dict(
            get="get the dataset as a pandas dataframe",
            get_task="get the TDC task associated with the dataset",
            get_schema="get a JSON for the dataset schema",
        )

    def get(self):
        """Invoke the zero-argument loader stored under ``lambda_f`` and return its result."""
        loader = self["lambda_f"]
        return loader()

    def get_task(self):
        """Return the value stored under the ``task`` key."""
        task = self["task"]
        return task

    def get_schema(self):
        """Return the ``schema`` item serialized as a JSON string (4-space indent)."""
        schema = self["schema"]
        return json.dumps(schema, indent=4)

# Every entity has a base instantiation with no items, so an agent can obtain
# the class-level description and function listing without picking a dataset.
base = Dataset()

# Li, Michelle, et al. dataset entity
# Li, Michelle, et al. dataset entity (scDTI task, loaded through DataLoader).
#
# Long texts are written as adjacent string literals inside parentheses rather
# than with backslash continuations inside a literal: a backslash continuation
# glues the last word of one source line to the first word of the next (or
# embeds the next line's indentation), which corrupted the text. Percent signs
# are written as a single "%" because these strings are not %-formatted.
pinnacle_opentargets = Dataset(
    name="Li, Michelle, et al.",
    task=(
        "scDTI: The goal is to train a model for predicting the probability that a protein is a candidate therapeutic target in a specific "
        "cell type. The model learns an estimator for a function of a protein target and a cell-type-specific biological context as input, and "
        "the model is tasked to predict the probability the candidate protein is a therapeutic target in that cell type."
    ),
    description=(
        "To curate target information for a therapeutic area, we examine the drugs indicated for the therapeutic area of interest and "
        "its descendants. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. Positive examples (i.e., "
        "where the label y = 1) are proteins targeted by drugs that have at least completed phase 2 of clinical trials for treating a specific "
        "therapeutic area. As such, a protein is a promising candidate if a compound that targets the protein is safe for humans and effective "
        "for treating the disease. We retain positive training examples activated in at least one cell type-specific protein interaction network. "
        "We define negative examples (i.e., where the label y = 0) as druggable proteins that do not have any known association with the "
        "therapeutic area of interest according to Open Targets. A protein is deemed druggable if targeted by at least one existing drug. "
        "We extract drugs and their nominal targets from Drugbank. We retain negative training examples activated in at least one cell "
        "type-specific protein interaction network."
    ),
    ml_task="Classification. Given the protein and cell-context, predict whether the protein is a therapeutic target.",
    statistics=(
        "The final number of positive (negative) samples for RA and IBD were 152 (1,465) and 114 (1,377), respectively. In PINNACLE, "
        "this dataset was augmented to include 156 cell types."
    ),
    split=(
        "Cold Protein Split. We split the dataset such that about 80% of the proteins are in the training set, about 10% of the proteins are in "
        "the validation set, and about 10% of the proteins are in the test set. The data splits are consistent for each cell type context to avoid data "
        "leakage."
    ),
    schema={
        "diseaseId": {
            "type": "string",
            "description": (
                "Disease ID. The two therapeutic areas examined are rheumatoid arthritis (RA) and inflammatory bowel disease. For "
                "rheumatoid arthritis, we collected therapeutic data (i.e., targets of drugs indicated for the therapeutic area) from OpenTargets "
                "for rheumatoid arthritis (EFO 0000685), ankylosing spondylitis (EFO 0003898), and psoriatic arthritis (EFO 0003778). "
                "For inflammatory bowel disease, we collected therapeutic data for ulcerative colitis (EFO 0000729), collagenous colitis "
                "(EFO 1001293), colitis (EFO 0003872), proctitis (EFO 0005628), Crohn’s colitis (EFO 0005622), lymphocytic colitis (EFO 1001294), "
                "Crohn’s disease (EFO 0000384), microscopic colitis (EFO 1001295), inflammatory bowel disease (EFO 0003767), "
                "appendicitis (EFO 0007149), ulcerative proctosigmoiditis (EFO 1001223), and small bowel Crohn’s disease (EFO 0005629)."
            ),
        },
        "targetId_genename": {
            "type": "string",
            "description": "Protein Target",
        },
        "Y": {
            "type": "int",
            "description": "Binary Indicator. Y=1 if the protein is a viable drug target for this disease and Y=0 otherwise.",
        },
    },
    # Deferred loader: the data is only fetched when Dataset.get() is called.
    lambda_f=lambda: DataLoader("opentargets_dti").get_data(),
)

# BindingDB (Kd) dataset entity for the DTI task.
binding_db_kd = Dataset(
    name="BindingDB_Kd",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein "
        "considered to be drug-targets with small, drug-like molecules. This is the Kd dataset."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
    statistics="(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {"type": "string", "description": "Drug SMILES string"},
        "Target": {"type": "string", "description": "Protein Target Amino Acid Sequence"},
        "Y": {"type": "decimal", "description": "Binding Affinity"},
    },
    # Deferred loader, evaluated only when the dataset is requested.
    lambda_f=lambda: DTI(name="BindingDB_Kd").harmonize_affinities(mode="mean").get_data(),
)

# BindingDB (IC50) dataset entity for the DTI task.
binding_db_ic50 = Dataset(
    name="BindingDB_IC50",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein "
        "considered to be drug-targets with small, drug-like molecules. This is the IC50 dataset."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
    statistics="(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {"type": "string", "description": "Drug SMILES string"},
        "Target": {"type": "string", "description": "Protein Target Amino Acid Sequence"},
        "Y": {"type": "decimal", "description": "Binding Affinity"},
    },
    # Deferred loader, evaluated only when the dataset is requested.
    lambda_f=lambda: DTI(name="BindingDB_IC50").harmonize_affinities(mode="mean").get_data(),
)

# BindingDB (Ki) dataset entity for the DTI task.
binding_db_ki = Dataset(
    name="BindingDB_Ki",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "BindingDB is a public, web-accessible database of measured binding affinities, focusing chiefly on the interactions of protein "
        "considered to be drug-targets with small, drug-like molecules. This is the Ki dataset."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity.",
    statistics="(# of DTI pairs, # of drugs, # of proteins) 52,284/10,665/1,413 for Kd, 991,486/549,205/5,078 for IC50, 375,032/174,662/3,070 for Ki.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {"type": "string", "description": "Drug SMILES string"},
        "Target": {"type": "string", "description": "Protein Target Amino Acid Sequence"},
        "Y": {"type": "decimal", "description": "Binding Affinity"},
    },
    # Deferred loader, evaluated only when the dataset is requested.
    lambda_f=lambda: DTI(name="BindingDB_Ki").harmonize_affinities(mode="mean").get_data(),
)

# Davis dataset entity for the DTI task.
#
# The description uses a single "%": the string is never %-formatted, so the
# previous "%%" escape was emitted literally (">80%%") to readers.
davis = Dataset(
    name="Davis",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description="The interaction of 72 kinase inhibitors with 442 kinases covering >80% of the human catalytic protein kinome.",
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity",
    statistics="25,772 DTI pairs, 68 drugs, 379 proteins",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {
            "type": "string",
            "description": "Drug SMILES string",
        },
        "Target": {
            "type": "string",
            "description": "Protein Target Amino Acid Sequence",
        },
        "Y": {
            "type": "decimal",
            "description": "Binding Affinity",
        },
    },
    # Deferred loader: the data is only fetched when Dataset.get() is called.
    lambda_f=lambda: DTI(name="DAVIS").get_data(),
)

# KIBA dataset entity for the DTI task.
#
# The description is built from adjacent string literals: the previous
# backslash continuation inside the literal joined "Tang" and "et al."
# without a separating space.
kiba = Dataset(
    name="KIBA",
    task=(
        "DTI: Drug-target interaction prediction task aims to predict the interaction activity score in silico given only the accessible compound "
        "structural information and protein amino acid sequence."
    ),
    description=(
        "Toward making use of the complementary information captured by the various bioactivity types, including IC50, K(i), and K(d), Tang "
        "et al. introduces a model-based integration approach, termed KIBA to generate an integrated drug-target bioactivity matrix."
    ),
    ml_task="Regression. Given the target amino acid sequence/compound SMILES string, predict their binding affinity",
    statistics="117,657 DTI pairs, 2,068 drugs, 229 proteins.",
    split="Random Split, Cold Drug Split, or Cold Protein Split",
    schema={
        "Drug": {
            "type": "string",
            "description": "Drug SMILES string",
        },
        "Target": {
            "type": "string",
            "description": "Protein Target Amino Acid Sequence",
        },
        "Y": {
            "type": "decimal",
            "description": "Binding Affinity",
        },
    },
    # Deferred loader: the data is only fetched when Dataset.get() is called.
    lambda_f=lambda: DTI(name="KIBA").get_data(),
)

# TODO: datasets for PPI, GDA, DrugRes
33 changes: 33 additions & 0 deletions tdc/agents/action_spaces/entity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
Base class for the action space. An entity.
"""

import json

class Entity(dict):
    """Dictionary-backed base class for members of the action space.

    Keyword arguments become dictionary items. Subclasses are expected to
    fill ``_description`` and ``_functions``; both start out as ``None``.
    """

    # Item keys that are left out of the JSON produced by get_json().
    _UNSERIALIZABLE = [
        "lambda_f"
    ]

    def __init__(self, **kwargs):
        super(Entity, self).__init__(**kwargs)
        self._functions = None
        self._description = None

    def get_json(self):
        """Return the items as a JSON string, omitting the unserializable keys."""
        serializable = {
            key: value
            for key, value in self.items()
            if key not in self._UNSERIALIZABLE
        }
        return json.dumps(serializable, indent=4)

    @property
    def entity_class_description(self):
        """JSON string of the field descriptions; asserts that they are set."""
        description = self._description
        assert description is not None, f"Entity description is None for {type(self)}"
        return json.dumps(description, indent=4)

    @property
    def entity_class_functions(self):
        """JSON string of the function descriptions; asserts that they are set."""
        functions = self._functions
        assert functions is not None, f"Entity functions is None for {type(self)}"
        return json.dumps(functions, indent=4)
Empty file.
22 changes: 22 additions & 0 deletions tdc/agents/action_spaces/tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""
An object for defining an action in the action space
"""
import json

class Tool(dict):
    """Dictionary describing one action (tool) in the action space.

    Keyword arguments become dictionary items. The keys ``name``, ``type``,
    ``description``, ``functions`` and ``instructions`` are required.

    Raises:
        AssertionError: if a required key is missing. The exception type is
            kept for backward compatibility, but it is raised explicitly so
            the validation still runs when Python is started with ``-O``.
    """

    # (required key, error-message template), listed in the order checked.
    _REQUIRED_KEYS = (
        ("name", "this tool needs a name. read params {}"),
        ("type", "this tool needs a type. read params {}"),
        ("description", "this tool needs a description. read params {}"),
        ("functions", "this tool needs a functions set. read params {}"),
        ("instructions", "this tools needs an instruction tuning dataset. read params {}"),
    )

    def __init__(self, **kwargs):
        super(Tool, self).__init__(**kwargs)
        for key, template in self._REQUIRED_KEYS:
            if key not in self:
                # The parameters are rendered only on failure, so a valid tool
                # holding non-JSON values (e.g. callables) can be constructed.
                raise AssertionError(template.format(self._describe_params()))

    def _describe_params(self):
        """Render the items for an error message, falling back to ``repr``."""
        try:
            return json.dumps(self, indent=4)
        except (TypeError, ValueError):
            return repr(dict(self))

    def get_json(self):
        """Return the tool serialized as a JSON string (4-space indent)."""
        return json.dumps(self, indent=4)

if __name__ == "__main__":
    # Manual smoke check. These arguments contain none of the keys Tool
    # requires, so construction is expected to fail validation before the
    # print below is reached.
    demo_tool = Tool(tst=1, ntstx=2)
    print("done!")
36 changes: 36 additions & 0 deletions tdc/test/test_agents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import os
import sys

sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
import unittest
import shutil

from pandas import DataFrame
from tdc.agents.action_spaces import datasets

class TestDatasets(unittest.TestCase):
    """Smoke tests for the dataset entities of the agent action space."""

    def setUp(self):
        print(os.getcwd())

    def test_pinnacle(self):
        """Check the schema and the loaded dataframe of the PINNACLE dataset."""
        import json  # local import: only needed to parse the schema string

        dataset = datasets.pinnacle_opentargets

        # get_schema() returns a JSON string, so it is parsed before checking
        # its structure; comparing the raw return type to dict can never pass.
        schema_json = dataset.get_schema()
        assert isinstance(schema_json, str), type(schema_json)
        schema = json.loads(schema_json)
        assert isinstance(schema, dict), type(schema)
        assert "Y" in schema

        # check dataframe
        df = dataset.get()
        assert isinstance(df, DataFrame)
        assert "Y" in df.columns
        assert "targetId_genename" in df.columns
        print(df.head())
        print(dataset.get_json())

    def tearDown(self):
        # Best-effort cleanup of the "data" directory under the working
        # directory; failures are ignored without a bare ``except``, which
        # would also swallow KeyboardInterrupt and SystemExit.
        try:
            cwd = os.getcwd()
        except OSError:
            return
        print(cwd)
        shutil.rmtree(os.path.join(cwd, "data"), ignore_errors=True)

0 comments on commit ba049bc

Please sign in to comment.