Skip to content

Commit

Permalink
Merge pull request #297 from mims-harvard/pinnacle_networks
Browse files Browse the repository at this point in the history
Pinnacle networks
  • Loading branch information
amva13 authored Jul 30, 2024
2 parents 923fad4 + 4b946a0 commit 10d1b1d
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 0 deletions.
8 changes: 8 additions & 0 deletions tdc/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -922,6 +922,10 @@ def get_task2category():
"tchard_pep_cdr3b_only_sampled_negs_train-2": "tab",
"tchard_pep_cdr3b_only_sampled_negs_train-3": "tab",
"tchard_pep_cdr3b_only_sampled_negs_train-4": "tab",
"cell_tissue_mg_edgelist": "txt",
"pinnacle_global_ppi_edgelist": "txt",
"pinnacle_protein_embed": "pth",
"pinnacle_labels_dict": "txt",
}

name2id = {
Expand Down Expand Up @@ -1094,6 +1098,10 @@ def get_task2category():
"tchard_pep_cdr3b_only_sampled_negs_train-2": 10228322,
"tchard_pep_cdr3b_only_sampled_negs_train-3": 10228316,
"tchard_pep_cdr3b_only_sampled_negs_train-4": 10228326,
"cell_tissue_mg_edgelist": 10407107,
"pinnacle_global_ppi_edgelist": 10407108,
"pinnacle_protein_embed": 10407128,
"pinnacle_labels_dict": 10409635,
}

oracle2type = {
Expand Down
72 changes: 72 additions & 0 deletions tdc/resource/pinnacle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
from ..utils import general_load
from ..utils.load import download_wrapper, load_json_from_txt_file

import pandas as pd
import os
import torch


class PINNACLE:
    """Load and expose the PINNACLE networks and protein embeddings.

    Constructing an instance loads four resources from ``path`` (the
    loading helpers are expected to download them when missing):

    * ``pinnacle_global_ppi_edgelist`` -> ``self.ppi``, columns
      ``["Protein A", "Protein B"]``
    * ``cell_tissue_mg_edgelist`` -> ``self.cell_tissue_mg``, columns
      ``["Tissue", "Cell"]``
    * ``pinnacle_protein_embed`` (a ``.pth`` file) -> ``self.embeds``
    * ``pinnacle_labels_dict`` -> ``self.keys``

    Reference:

    @article{
    Li2023,
    author = "Michelle Li",
    title = "{PINNACLE}",
    year = "2023",
    month = "4",
    url = "https://figshare.com/articles/software/AWARE/22708126",
    doi = "10.6084/m9.figshare.22708126.v5"
    }
    """

    def __init__(self, path="./data"):
        """Load all PINNACLE resources.

        Args:
            path: Directory the resource files are read from (and,
                presumably, downloaded into by the loading helpers).
        """
        self.ppi_name = "pinnacle_global_ppi_edgelist"
        self.cell_tissue_mg_name = "cell_tissue_mg_edgelist"
        # Third argument is presumably the field separator -- TODO confirm
        # against ``general_load``.
        self.ppi = general_load(self.ppi_name, path, " ")
        self.ppi.columns = ["Protein A", "Protein B"]
        self.cell_tissue_mg = general_load(
            self.cell_tissue_mg_name, path,
            "\t")  # use tab as names were left with spaces
        self.cell_tissue_mg.columns = ["Tissue", "Cell"]
        self.embeds_name = "pinnacle_protein_embed"
        # Pass the candidate names as a list, matching the other
        # ``download_wrapper`` call sites in the code base.
        self.embeds_name = download_wrapper(self.embeds_name, path,
                                            [self.embeds_name])
        # NOTE(review): ``torch.load`` unpickles the file; only point this at
        # trusted downloads.
        self.embeds = torch.load(os.path.join(path, self.embeds_name + ".pth"))
        self.keys = load_json_from_txt_file("pinnacle_labels_dict", path)

    def get_ppi(self):
        """Return the global PPI edge list loaded in ``__init__``."""
        return self.ppi

    def get_mg(self):
        """Return the cell/tissue edge list loaded in ``__init__``."""
        return self.cell_tissue_mg

    def get_embeds_raw(self):
        """Return the embeddings object exactly as ``torch.load`` gave it."""
        return self.embeds

    def get_keys(self):
        """Return the (target, cell type) label pairs as a DataFrame.

        Rows of ``self.keys`` whose ``"Cell Type"`` starts with ``"BTO"``,
        ``"CCI"`` or ``"Sanity"`` are filtered out.

        Returns:
            A DataFrame with columns ``"target"`` and ``"cell type"``, one
            row per remaining label, in the original order.
        """
        excluded_prefixes = ("BTO", "CCI", "Sanity")
        pairs = [
            (cell_type, name)
            for cell_type, name in zip(self.keys["Cell Type"],
                                       self.keys["Name"])
            if not cell_type.startswith(excluded_prefixes)
        ]
        # Duplicates are intentionally kept: ``get_embeds`` looks rows up by
        # position, so dropping any row would misalign labels and embeddings.
        # (The previous ``proteins.drop_duplicates()`` call discarded its
        # result and therefore never removed anything.)
        return pd.DataFrame.from_dict({
            "target": [name for _, name in pairs],
            "cell type": [cell_type for cell_type, _ in pairs],
        })

    def get_embeds(self):
        """Return the embeddings as one DataFrame keyed by label.

        Each key ``k`` of the raw embeddings mapping is used as a positional
        row index into ``get_keys()``; that row is turned into the label
        ``"<target>--<cell type>"``.

        Returns:
            The per-label DataFrames concatenated along axis 0, with the
            label as the outer index level.

        Raises:
            TypeError: If a value in the raw embeddings is not a
                ``torch.Tensor``.
        """
        prots = self.get_keys()
        emb = self.get_embeds_raw()
        labelled = {"--".join(prots.iloc[k]): v for k, v in emb.items()}
        frames = {}
        for label, tensor in labelled.items():
            if not isinstance(tensor, torch.Tensor):
                raise TypeError("encountered non-tensor")
            # ``.cpu()`` is a no-op for CPU tensors and makes the NumPy
            # conversion work for tensors stored on another device.
            frames[label] = pd.DataFrame(tensor.detach().cpu().numpy())
        return pd.concat(frames, axis=0)
29 changes: 29 additions & 0 deletions tdc/test/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,5 +70,34 @@ def test_node_retrieval(self):
assert "1" in l and "9997" in l


class TestPINNACLE(unittest.TestCase):
    """Smoke tests for the PINNACLE resource loader."""

    def setUp(self):
        pass

    def test_mg_ppi_load(self):
        """The edge lists and the embeddings load as non-empty DataFrames."""
        from tdc.resource.pinnacle import PINNACLE
        resource = PINNACLE()
        ppi = resource.get_ppi()
        mg = resource.get_mg()
        assert isinstance(ppi, DataFrame)
        assert isinstance(mg, DataFrame)
        assert len(ppi) > 0
        assert len(mg) > 0
        embeddings = resource.get_embeds()
        assert isinstance(embeddings, DataFrame)
        assert len(embeddings) > 0, "PINNACLE embeds is empty"

    def test_embeddings(self):
        """Embeddings and label keys are non-empty and have equal length."""
        from tdc.resource.pinnacle import PINNACLE
        resource = PINNACLE()
        embeddings = resource.get_embeds()
        assert isinstance(embeddings, DataFrame)
        assert len(embeddings) > 0, "PINNACLE embeds is empty"
        labels = resource.get_keys()
        assert isinstance(labels, DataFrame)
        assert len(labels) > 0, "PINNACLE keys is empty"
        n_labels = len(labels)
        n_embeddings = len(embeddings)
        assert n_labels == n_embeddings, "{} vs {}".format(
            n_labels, n_embeddings)


# Run the unittest test runner when this file is executed directly.
if __name__ == "__main__":
unittest.main()
36 changes: 36 additions & 0 deletions tdc/utils/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,24 @@ def pd_load(name, path):
r = maxlen - len(v)
file_content[k] = v + [None] * r
df = pd.DataFrame(file_content)

elif name2type[name] == "pth":
import torch
tensors = torch.load(
os.path.join(path, name + "." + name2type[name]))
dfs = {}
if isinstance(tensors, dict):
for k, v in tensors.items():
if isinstance(v, torch.Tensor):
dfs[k] = pd.DataFrame(v.detach().numpy())
else:
raise Exception("encountered non-tensor")
df = pd.concat(dfs, axis=0)
elif torch.is_tensor(tensors):
df = pd.DataFrame(tensors.detach().numpy())
else:
raise Exception("encountered non-tensor")

else:
raise ValueError(
"The file type must be one of tab/csv/xlsx/pickle/zip.")
Expand All @@ -338,6 +356,24 @@ def pd_load(name, path):
)


def load_json_from_txt_file(name, path):
    """Load a JSON-like ``.txt`` resource into a DataFrame.

    The resource is resolved through ``download_wrapper`` and then read from
    ``<path>/<name>.txt``. The text must decode to a mapping of column name
    to list; shorter lists are padded with ``None`` so that all columns have
    the same length.

    Args:
        name: Resource name, passed to ``download_wrapper``.
        path: Directory containing (or receiving) the ``.txt`` file.

    Returns:
        A ``pandas.DataFrame`` with one column per key of the decoded mapping.

    Raises:
        json.JSONDecodeError: If the text is neither valid JSON nor valid
            JSON after replacing single quotes with double quotes.
    """
    import json

    name = download_wrapper(name, path, [name])
    file_path = os.path.join(path, name + ".txt")
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    try:
        # Prefer strict JSON so apostrophes inside string values are kept.
        file_content = json.loads(data)
    except json.JSONDecodeError:
        # Fall back for files written with single-quoted strings.
        file_content = json.loads(data.replace("'", '"'))
    # ``default=0`` keeps an empty mapping from raising in ``max``.
    maxlen = max((len(x) for x in file_content.values()), default=0)
    padded = {
        k: v + [None] * (maxlen - len(v)) for k, v in file_content.items()
    }
    return pd.DataFrame(padded)


def property_dataset_load(name, path, target, dataset_names):
"""a wrapper to download, process and load single-instance prediction task datasets
Expand Down

0 comments on commit 10d1b1d

Please sign in to comment.