Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Geneformer deploy #299

Merged
merged 10 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ numpy>=1.26.4,<2.0.0
openpyxl>=3.0.10,<4.0.0
pandas>=2.1.4,<3.0.0
requests>=2.31.0,<3.0.0
# scikit-learn>=1.2.2,<2.0.0
scikit-learn==1.2.2
seaborn>=0.12.2,<1.0.0
tqdm>=4.65.0,<5.0.0
Expand Down
22 changes: 21 additions & 1 deletion tdc/tdc_hf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
'CYP3A4_Veith-AttentiveFP',
]

model_hub = ["Geneformer"]


class tdc_hf_interface:
'''
Expand All @@ -29,7 +31,10 @@ class tdc_hf_interface:

def __init__(self, repo_name):
self.repo_id = "tdc/" + repo_name
self.model_name = repo_name.split('-')[1]
try:
self.model_name = repo_name.split('-')[1]
except:
self.model_name = repo_name

def upload(self, folder_path):
create_repo(repo_id=self.repo_id)
Expand All @@ -47,6 +52,21 @@ def file_download(self, save_path, filename):
def repo_download(self, save_path):
snapshot_download(repo_id=self.repo_id, cache_dir=save_path)

def load(self):
if self.model_name not in model_hub:
raise Exception("this model is not in the TDC model hub GH repo.")
elif self.model_name == "Geneformer":
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
# tokenizer = AutoTokenizer.from_pretrained("ctheodoris/Geneformer")
model = AutoModelForMaskedLM.from_pretrained(
"ctheodoris/Geneformer")
# pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer)
# pipe = pipeline("fill-mask", model="ctheodoris/Geneformer")
# return pipe
return model
raise Exception("Not implemented yet!")

def load_deeppurpose(self, save_path):
if self.repo_id[4:] in deeppurpose_repo:
save_path = save_path + '/' + self.repo_id[4:]
Expand Down
66 changes: 66 additions & 0 deletions tdc/test/test_hf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-

from __future__ import division
from __future__ import print_function

import os
import sys

import unittest
import shutil
import pytest

# temporary solution for relative imports in case TDC is not installed
# if TDC is installed, no need to use the following line
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
# TODO: add verification for the generation other than simple integration


class TestHF(unittest.TestCase):

def setUp(self):
print(os.getcwd())
pass

@pytest.mark.skip(
reason="This test is skipped due to deeppurpose installation dependency"
)
@unittest.skip(reason="DeepPurpose")
def test_hf_load_predict(self):
from tdc.single_pred import Tox
data = Tox(name='herg_karim')

from tdc import tdc_hf_interface
tdc_hf = tdc_hf_interface("hERG_Karim-CNN")
# load deeppurpose model from this repo
dp_model = tdc_hf.load_deeppurpose('./data')
tdc_hf.predict_deeppurpose(dp_model, ['CC(=O)NC1=CC=C(O)C=C1'])

def test_hf_transformer(self):
from tdc import tdc_hf_interface
# from transformers import Pipeline
from transformers import BertForMaskedLM as BertModel
geneformer = tdc_hf_interface("Geneformer")
model = geneformer.load()
# assert isinstance(pipeline, Pipeline)
assert isinstance(model, BertModel), type(model)

# def test_hf_load_new_pytorch_standard(self):
# from tdc import tdc_hf_interface
# # from tdc.resource.dataloader import DataLoader
# # data = DataLoader(name="pinnacle_dti")
# tdc_hf = tdc_hf_interface("mli-PINNACLE")
# dp_model = tdc_hf.load()
# assert dp_model is not None

def tearDown(self):
try:
print(os.getcwd())
shutil.rmtree(os.path.join(os.getcwd(), "data"))
except:
pass


if __name__ == "__main__":
unittest.main()
Loading