From 70c37734f7f8b019d730186ed42ae982e2711b1c Mon Sep 17 00:00:00 2001
From: shamikbose
Date: Sun, 3 Jul 2022 15:56:49 -0400
Subject: [PATCH 1/3] WIP Tagging errors down from 475 to 247

---
 .../thomas2011/download_asbtracts.py        |  27 ++++
 bigbio/biodatasets/thomas2011/thomas2011.py | 130 ++++++++----------
 2 files changed, 82 insertions(+), 75 deletions(-)
 create mode 100644 bigbio/biodatasets/thomas2011/download_asbtracts.py

diff --git a/bigbio/biodatasets/thomas2011/download_asbtracts.py b/bigbio/biodatasets/thomas2011/download_asbtracts.py
new file mode 100644
index 00000000..fc2e6e7a
--- /dev/null
+++ b/bigbio/biodatasets/thomas2011/download_asbtracts.py
@@ -0,0 +1,27 @@
+import requests
+import os
+import xml.etree.ElementTree as ET
+
+for id in ["17563728", "17548681", "17566096"]:
+    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+    params = {
+        "db": "pubmed",
+        "id": id,
+        "retmode": "xml",
+        "rettype": "medline",
+    }
+    res = requests.get(url, params=params)
+    blank_line_count = 0
+    required_text_lines = []
+    tree = ET.XML(res.text)
+    article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
+    article_title = article.find("ArticleTitle").text
+    abstract_parts = [article_title]
+    article_abstract = article.find("Abstract").findall("AbstractText")
+    for abstract_part in article_abstract:
+        print(abstract_part.attrib)
+        abstract_parts.append(abstract_part.text)
+    print(
+        f'PMID: {id}\nTitle: {abstract_parts[0]}\nAbstract: {"".join(abstract_parts[1:])}'
+    )
+    print("|=" * 45)
diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py
index 427c4be2..b0e4745b 100644
--- a/bigbio/biodatasets/thomas2011/thomas2011.py
+++ b/bigbio/biodatasets/thomas2011/thomas2011.py
@@ -43,15 +43,16 @@
 from pathlib import Path
 from shutil import rmtree
 from typing import Dict, List, Tuple
-
+import xml.etree.ElementTree as ET
 import datasets
 import pandas as pd
+import requests
 
 from bigbio.utils import schemas
 from bigbio.utils.configs import BigBioConfig
 from bigbio.utils.constants import Lang, Tasks
 from bigbio.utils.license import CustomLicense
-
+import time
 _LANGUAGES = [Lang.EN]
 _PUBMED = True
@@ -98,8 +99,7 @@
 # this is a backup url in case the official one will stop working
 # _URLS = ["http://github.com/rockt/SETH/zipball/master/"]
 _URLS = {
-    "source": "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
-    "bigbio_kb": "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
+    _DATASETNAME: "https://www.scai.fraunhofer.de/content/dam/scai/de/downloads/bioinformatik/normalization-variation-corpus.gz",
 }
 
 _SUPPORTED_TASKS = [
@@ -117,35 +117,16 @@ class Thomas2011Dataset(datasets.GeneratorBasedBuilder):
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
 
-    # You will be able to load the "source" or "bigbio" configurations with
-    # ds_source = datasets.load_dataset('my_dataset', name='source')
-    # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio')
-
-    # For local datasets you can make use of the `data_dir` and `data_files` kwargs
-    # https://huggingface.co/docs/datasets/add_dataset.html#downloading-data-files-and-organizing-splits
-    # ds_source = datasets.load_dataset('my_dataset', name='source', data_dir="/path/to/data/files")
-    # ds_bigbio = datasets.load_dataset('my_dataset', name='bigbio', data_dir="/path/to/data/files")
-
-    # TODO: For each dataset, implement Config for Source and BigBio;
-    #  If dataset contains more than one subset (see examples/bioasq.py) implement for EACH of them.
-    #  Each of them should contain:
-    #   - name: should be unique for each dataset config eg. bioasq10b_(source|bigbio)_[bigbio_schema_name]
-    #   - version: option = (SOURCE_VERSION|BIGBIO_VERSION)
-    #   - description: one line description for the dataset
-    #   - schema: options = (source|bigbio_[bigbio_schema_name])
-    #   - subset_id: subset id is the canonical name for the dataset (eg. bioasq10b)
-    #    where [bigbio_schema_name] = (kb, pairs, qa, text, t2t, entailment)
-
     BUILDER_CONFIGS = [
         BigBioConfig(
-            name="thomas2011_source",
+            name=f"{_DATASETNAME}_source",
             version=SOURCE_VERSION,
             description="Thomas et al 2011 source schema",
             schema="source",
             subset_id="thomas2011",
         ),
         BigBioConfig(
-            name="thomas2011_bigbio_kb",
+            name=f"{_DATASETNAME}_bigbio_kb",
             version=BIGBIO_VERSION,
             description="Thomas et al 2011 BigBio schema",
             schema="bigbio_kb",
@@ -153,16 +134,10 @@ class Thomas2011Dataset(datasets.GeneratorBasedBuilder):
         ),
     ]
 
-    DEFAULT_CONFIG_NAME = "thomas2011_source"
+    DEFAULT_CONFIG_NAME = f"{_DATASETNAME}_source"
 
     def _info(self) -> datasets.DatasetInfo:
 
-        # Create the source schema; this schema will keep all keys/information/labels as close to the original dataset as possible.
-        # Much of this design is copied from biodatasets/verspoor_2013/verspoor_2013.py
-
-        # You can arbitrarily nest lists and dictionaries.
-        # For iterables, use lists over tuples or `datasets.Sequence`
-
         if self.config.schema == "source":
             features = datasets.Features(
                 {
@@ -188,54 +163,49 @@ def _info(self) -> datasets.DatasetInfo:
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
-        # Download gets entire git repo containing unused data from other datasets
-        # repo_dir = Path(dl_manager.download_and_extract(_URLS[0]))
-        # data_dir = repo_dir / "data"
-        # data_dir.mkdir(exist_ok=True)
-
-        # Find the relevant files from Verspor2013 and move them to a new directory
-        # thomas2011_files = repo_dir.glob("*/*/*thomas2011/**/*")
-        # for file in thomas2011_files:
-        #     if file.is_file() and "README" not in str(file):
-        #         file.rename(data_dir / file.name)
-
-        # Delete all unused files and directories from the original download
-        # for x in repo_dir.glob("[!data]*"):
-        #     if x.is_file():
-        #         x.unlink()
-        #     elif x.is_dir():
-        #         rmtree(x)
-
-        data_dir = dl_manager.download_and_extract(_URLS[self.config.schema])
+        data_dir = dl_manager.download_and_extract(_URLS[_DATASETNAME])
 
         return [
             datasets.SplitGenerator(
-                name=datasets.Split.TEST,
+                name=datasets.Split.TRAIN,
                 # Whatever you put in gen_kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath": os.path.join(data_dir, "annotations.txt"),
-                    "split": "test",
                 },
             )
         ]
 
-    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
-
-    # TODO: change the args of this function to match the keys in `gen_kwargs`. You may add any necessary kwargs.
-    def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
+    def get_clean_pubmed_abstract(self, id):
+        url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
+        params = {
+            "db": "pubmed",
+            "id": id,
+            "retmode": "xml",
+            "rettype": "medline",
+        }
+        res = requests.get(url, params=params)
+        tree = ET.XML(res.text)
+        article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
+        article_title = article.find("ArticleTitle").text + " "
+        abstract_parts = [article_title]
+        article_abstract = article.find("Abstract").findall("AbstractText")
+        for abstract_part in article_abstract:
+            abstract_parts.append(abstract_part.text)
+        return "".join(abstract_parts)
+
+    def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
-        if split == "test":
-            data_ann = []
-            with open(filepath, encoding="utf-8") as ann_tsv_file:
-                csv_reader_code = csv.reader(
-                    ann_tsv_file,
-                    quotechar="'",
-                    delimiter="\t",
-                    quoting=csv.QUOTE_ALL,
-                    skipinitialspace=True,
-                )
-                for id_, row in enumerate(csv_reader_code):
-                    data_ann.append(row)
+        data_ann = []
+        with open(filepath, encoding="utf-8") as ann_tsv_file:
+            csv_reader_code = csv.reader(
+                ann_tsv_file,
+                quotechar="'",
+                delimiter="\t",
+                quoting=csv.QUOTE_ALL,
+                skipinitialspace=True,
+            )
+            for id_, row in enumerate(csv_reader_code):
+                data_ann.append(row)
 
         if self.config.schema == "source":
             for id_, row in enumerate(data_ann):
@@ -259,8 +229,22 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
             ]
             df = pd.DataFrame(data_ann, columns=cols)
             uid = 0
+            curr_count = 0
             for id_ in df.doc_id.unique():
+                curr_count += 1
+                if curr_count == 3:
+                    # The PubMed API limits 3 requests per second without an API key
+                    time.sleep(0.5)
+                    curr_count = 0
                 elist = []
+                abstract_text = self.get_clean_pubmed_abstract(id_)
+                uid += 1
+                passage = {
+                    "id": uid,
+                    "type": "",
+                    "text": [abstract_text],
+                    "offsets": [[0, len(abstract_text)]],
+                }
                 for row in df.loc[df.doc_id == id_].itertuples():
                     uid += 1
                     if row.protein_or_nucleotide_sequence_mutation == "PSM":
@@ -272,7 +256,7 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
                             "id": str(uid),
                             "type": ent_type,
                             "text": [row.covered_text],
-                            "offsets": [[int(row.off1), int(row.off2)]],
+                            "offsets": [[int(row.off1) - 1, int(row.off2) - 1]],
                             "normalized": [{"db_name": "dbSNP", "db_id": row.dbSNP_id}],
                         }
                     )
@@ -280,12 +264,8 @@ def _generate_examples(self, filepath: str, split: str) -> Tuple[int, Dict]:
                 "id": id_,  # uid is an unique identifier for every record that starts from 1
                 "document_id": str(row[0]),
                 "entities": elist,
-                "passages": [],
+                "passages": [passage],
                 "events": [],
                 "coreferences": [],
                 "relations": [],
             }
-
-
-# This template is based on the following template from the datasets package:
-# https://github.com/huggingface/datasets/blob/master/templates/new_dataset_script.py

From 587cda456b3e53ee7d69b0f98e98544b5f5a45c2 Mon Sep 17 00:00:00 2001
From: shamikbose
Date: Tue, 5 Jul 2022 12:56:37 -0400
Subject: [PATCH 2/3] Changes for building abstract

Abstract is built as follows:
`{title} {label}: {abstract_part.text}`
---
 .../thomas2011/download_asbtracts.py        | 18 +++++++++++-------
 bigbio/biodatasets/thomas2011/thomas2011.py | 14 ++++++++++----
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/bigbio/biodatasets/thomas2011/download_asbtracts.py b/bigbio/biodatasets/thomas2011/download_asbtracts.py
index fc2e6e7a..df25e8bb 100644
--- a/bigbio/biodatasets/thomas2011/download_asbtracts.py
+++ b/bigbio/biodatasets/thomas2011/download_asbtracts.py
@@ -2,7 +2,10 @@
 import os
 import xml.etree.ElementTree as ET
 
-for id in ["17563728", "17548681", "17566096"]:
+for id in [
+    "16338218",
+    "15645182",
+]:
     url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
     params = {
         "db": "pubmed",
@@ -16,12 +19,13 @@
     tree = ET.XML(res.text)
     article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
     article_title = article.find("ArticleTitle").text
-    abstract_parts = [article_title]
+    abstract_parts = [f"{article_title}"]
     article_abstract = article.find("Abstract").findall("AbstractText")
     for abstract_part in article_abstract:
-        print(abstract_part.attrib)
-        abstract_parts.append(abstract_part.text)
-    print(
-        f'PMID: {id}\nTitle: {abstract_parts[0]}\nAbstract: {"".join(abstract_parts[1:])}'
-    )
+        label = abstract_part.attrib.get("Label", "")
+        if label:
+            abstract_parts.append(f"{label}:{abstract_part.text}")
+        else:
+            abstract_parts.append(abstract_part.text)
+    print(f'PMID: {id}\n{"".join(abstract_parts)}')
     print("|=" * 45)
diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py
index b0e4745b..be39b47e 100644
--- a/bigbio/biodatasets/thomas2011/thomas2011.py
+++ b/bigbio/biodatasets/thomas2011/thomas2011.py
@@ -183,14 +183,20 @@ def get_clean_pubmed_abstract(self, id):
             "rettype": "medline",
         }
         res = requests.get(url, params=params)
+        blank_line_count = 0
+        required_text_lines = []
         tree = ET.XML(res.text)
         article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
-        article_title = article.find("ArticleTitle").text + " "
-        abstract_parts = [article_title]
+        article_title = article.find("ArticleTitle").text
+        abstract_parts = [f"{article_title}"]
         article_abstract = article.find("Abstract").findall("AbstractText")
         for abstract_part in article_abstract:
-            abstract_parts.append(abstract_part.text)
-        return "".join(abstract_parts)
+            label = abstract_part.attrib.get("Label", "")
+            if label:
+                abstract_parts.append(f"{label}: {abstract_part.text}")
+            else:
+                abstract_parts.append(abstract_part.text)
+        return " ".join(abstract_parts)
 
     def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:

From 4c7b8131a585c6969bd96cf4bea720c31507c0b3 Mon Sep 17 00:00:00 2001
From: shamikbose
Date: Tue, 5 Jul 2022 17:46:39 -0400
Subject: [PATCH 3/3] Passes all tests

Mismatched offsets in 7 examples, all others pass
---
 .../thomas2011/download_asbtracts.py        | 31 -------------------
 bigbio/biodatasets/thomas2011/thomas2011.py | 13 +++++---
 2 files changed, 8 insertions(+), 36 deletions(-)
 delete mode 100644 bigbio/biodatasets/thomas2011/download_asbtracts.py

diff --git a/bigbio/biodatasets/thomas2011/download_asbtracts.py b/bigbio/biodatasets/thomas2011/download_asbtracts.py
deleted file mode 100644
index df25e8bb..00000000
--- a/bigbio/biodatasets/thomas2011/download_asbtracts.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import requests
-import os
-import xml.etree.ElementTree as ET
-
-for id in [
-    "16338218",
-    "15645182",
-]:
-    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
-    params = {
-        "db": "pubmed",
-        "id": id,
-        "retmode": "xml",
-        "rettype": "medline",
-    }
-    res = requests.get(url, params=params)
-    blank_line_count = 0
-    required_text_lines = []
-    tree = ET.XML(res.text)
-    article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
-    article_title = article.find("ArticleTitle").text
-    abstract_parts = [f"{article_title}"]
-    article_abstract = article.find("Abstract").findall("AbstractText")
-    for abstract_part in article_abstract:
-        label = abstract_part.attrib.get("Label", "")
-        if label:
-            abstract_parts.append(f"{label}:{abstract_part.text}")
-        else:
-            abstract_parts.append(abstract_part.text)
-    print(f'PMID: {id}\n{"".join(abstract_parts)}')
-    print("|=" * 45)
diff --git a/bigbio/biodatasets/thomas2011/thomas2011.py b/bigbio/biodatasets/thomas2011/thomas2011.py
index be39b47e..e5c5734d 100644
--- a/bigbio/biodatasets/thomas2011/thomas2011.py
+++ b/bigbio/biodatasets/thomas2011/thomas2011.py
@@ -183,8 +183,6 @@ def get_clean_pubmed_abstract(self, id):
             "rettype": "medline",
         }
         res = requests.get(url, params=params)
-        blank_line_count = 0
-        required_text_lines = []
         tree = ET.XML(res.text)
         article = tree.find("PubmedArticle").find("MedlineCitation").find("Article")
         article_title = article.find("ArticleTitle").text
@@ -196,7 +194,7 @@ def get_clean_pubmed_abstract(self, id):
                 abstract_parts.append(f"{label}: {abstract_part.text}")
             else:
                 abstract_parts.append(abstract_part.text)
-        return " ".join(abstract_parts)
+        return article_title, " ".join(abstract_parts)
 
     def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
@@ -243,7 +241,7 @@ def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
                     time.sleep(0.5)
                     curr_count = 0
                 elist = []
-                abstract_text = self.get_clean_pubmed_abstract(id_)
+                article_title, abstract_text = self.get_clean_pubmed_abstract(id_)
                 uid += 1
                 passage = {
                     "id": uid,
@@ -251,18 +249,23 @@ def _generate_examples(self, filepath: str) -> Tuple[int, Dict]:
                     "offsets": [[0, len(abstract_text)]],
                 }
+
                 for row in df.loc[df.doc_id == id_].itertuples():
                     uid += 1
                     if row.protein_or_nucleotide_sequence_mutation == "PSM":
                         ent_type = "Protein Sequence Mutation"
                     else:
                         ent_type = "Nucleotide Sequence Mutation"
+                    tag_start, tag_end = int(row.off1), int(row.off2)
+                    if tag_start > len(article_title):
+                        tag_start -= 1
+                        tag_end -= 1
                     elist.append(
                         {
                             "id": str(uid),
                             "type": ent_type,
                             "text": [row.covered_text],
-                            "offsets": [[int(row.off1) - 1, int(row.off2) - 1]],
+                            "offsets": [[tag_start, tag_end]],
                             "normalized": [{"db_name": "dbSNP", "db_id": row.dbSNP_id}],
                         }
                     )
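
Usage sketch for the loader patched above: a minimal example, assuming a local checkout of the bigbio repository with the `bigbio` package importable. Only the config names ("thomas2011_source", "thomas2011_bigbio_kb") and the train split come from the diffs; the script path and the inspection code are illustrative.

import datasets

# Assumed path to the loader inside a local bigbio checkout (adjust as needed).
SCRIPT = "bigbio/biodatasets/thomas2011/thomas2011.py"

# Source schema: rows mirror the columns of annotations.txt.
ds_source = datasets.load_dataset(SCRIPT, name="thomas2011_source")

# BigBio KB schema: each document carries the fetched PubMed abstract as a
# single passage plus its mutation-mention entities.
ds_kb = datasets.load_dataset(SCRIPT, name="thomas2011_bigbio_kb")

example = ds_kb["train"][0]
print(example["id"])                            # document key (PubMed ID from annotations.txt)
print(example["passages"][0]["text"][0][:120])  # start of the title + abstract text
print(example["entities"][0])                   # first mutation mention with its dbSNP normalization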