Closes #714 #721

Open
wants to merge 4 commits into base: main
196 changes: 151 additions & 45 deletions bigbio/biodatasets/jnlpba/jnlpba.py
@@ -23,7 +23,8 @@
from typing import Dict, List, Tuple

import datasets

import os
import itertools
from bigbio.utils import schemas
from bigbio.utils.configs import BigBioConfig
from bigbio.utils.constants import Lang, Tasks
@@ -33,7 +34,6 @@
_PUBMED = True
_LOCAL = False

# TODO: Add BibTeX citation
_CITATION = """\
@inproceedings{collier-kim-2004-introduction,
title = "Introduction to the Bio-entity Recognition Task at {JNLPBA}",
@@ -61,21 +61,20 @@
_LICENSE = Licenses.CC_BY_3p0

_URLS = {
_DATASETNAME: "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz",
"train": "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Train/Genia4ERtraining.tar.gz",
"test": "http://www.nactem.ac.uk/GENIA/current/Shared-tasks/JNLPBA/Evaluation/Genia4ERtest.tar.gz",
}

# TODO: add supported task by dataset. One dataset may support multiple tasks
_SUPPORTED_TASKS = [
Tasks.NAMED_ENTITY_RECOGNITION
] # example: [Tasks.TRANSLATION, Tasks.NAMED_ENTITY_RECOGNITION, Tasks.RELATION_EXTRACTION]

# TODO: set this to a version that is associated with the dataset. if none exists use "1.0.0"
# This version doesn't have to be consistent with semantic versioning. Anything that is
# provided by the original dataset as a version goes.
_SOURCE_VERSION = "3.2.0"

_BIGBIO_VERSION = "1.0.0"

logger = datasets.utils.logging.get_logger(__name__)


class JNLPBADataset(datasets.GeneratorBasedBuilder):
"""
@@ -114,7 +113,29 @@ class JNLPBADataset(datasets.GeneratorBasedBuilder):
def _info(self) -> datasets.DatasetInfo:

if self.config.schema == "source":
features = datasets.load_dataset("jnlpba", split="train").features
features = datasets.Features(
{
"id": datasets.Value("string"),
"tokens": datasets.Sequence(datasets.Value("string")),
"ner_tags": datasets.Sequence(
datasets.features.ClassLabel(
names=[
"O",
"B-DNA",
"I-DNA",
"B-RNA",
"I-RNA",
"B-cell_line",
"I-cell_line",
"B-cell_type",
"I-cell_type",
"B-protein",
"I-protein",
]
)
),
}
)
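# Illustrative shape of one example under this source schema (values invented
# for illustration):
#   {"id": "0",
#    "tokens": ["IL-2", "gene", "expression"],
#    "ner_tags": ["B-DNA", "I-DNA", "O"]}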

elif self.config.schema == "bigbio_kb":
features = schemas.kb_features
@@ -129,54 +150,139 @@ def _info(self) -> datasets.DatasetInfo:

def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
"""Returns SplitGenerators."""
data = datasets.load_dataset("jnlpba")

train_filepath = dl_manager.download_and_extract(_URLS["train"])
test_filepath = dl_manager.download_and_extract(_URLS["test"])
train_file = os.path.join(train_filepath, "Genia4ERtask1.iob2")
test_file = os.path.join(test_filepath, "Genia4EReval1.iob2")
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
# Whatever you put in gen_kwargs will be passed to _generate_examples
gen_kwargs={"data": data["train"]},
gen_kwargs={"filepath": train_file},
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={"data": data["validation"]},
name=datasets.Split.TEST,
gen_kwargs={"filepath": test_file},
),
]

def _generate_examples(self, data: datasets.Dataset) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
uid = 0
def _parse_sentence(self, tokens, ner_tags, uid):
"""
This function takes in two stacks, one with tokens and the other with tags.
It returns the passage and the entities as required by the bigbio_kb schema.
"""
entities = []
sentence_words = []
distance_from_back = -1
while tokens:
curr_token = tokens.pop()
ner_tag = ner_tags.pop()
distance_from_back += len(curr_token) + 1
sentence_words.append(curr_token)
if ner_tag.startswith("I-"):
# Keep popping elements until the next B-* tag is hit
tag_tokens = [curr_token]
curr_tag = ner_tag[2:]
while not ner_tag.startswith("B-"):
curr_token = tokens.pop()
ner_tag = ner_tags.pop()
distance_from_back += len(curr_token) + 1
sentence_words.append(curr_token)
tag_tokens.append(curr_token)
tag_text = " ".join(list(reversed(tag_tokens)))
tag_start = distance_from_back
tag_end = tag_start - len(tag_text)
entity = {
"id": next(uid),
"type": curr_tag,
"text": [tag_text],
"normalized": [],
"offsets": [[tag_start, tag_end]],
}
entities.append(entity)
elif ner_tag.startswith("B-"):
curr_tag = ner_tag[2:]
tag_start = distance_from_back
tag_end = tag_start - len(curr_token)
entity = {
"id": next(uid),
"type": curr_tag,
"text": [curr_token],
"normalized": [],
"offsets": [[tag_start, tag_end]],
}
entities.append(entity)
elif ner_tag == "O":
continue
passage = " ".join(list(reversed(sentence_words)))
for entity in entities:
entity_start = len(passage) - entity["offsets"][0][0]
entity_end = len(passage) - entity["offsets"][0][1]
entity["offsets"][0][1] = entity_end
entity["offsets"][0][0] = entity_start

document = {}
document["id"] = next(uid)
document["document_id"] = document["id"]
document["entities"] = entities
document["passages"] = [
{
"id": next(uid),
"type": "",
"text": [passage],
"offsets": [[0, len(passage)]],
}
]
document["relations"] = []
document["events"] = []
document["coreferences"] = []
return document["id"], document
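# Worked example (illustrative, not part of this diff): with the
# reversed-stack inputs
#   tokens   = ["IL-2", "gene", "expression"]
#   ner_tags = ["B-DNA", "I-DNA", "O"]
# the loop pops "expression" first and accumulates distance_from_back; after
# the final front-offset flip the method returns
#   passage  = "IL-2 gene expression"
#   entities = [{"type": "DNA", "text": ["IL-2 gene"], "offsets": [[0, 9]], ...}]
# so that passage[0:9] == "IL-2 gene".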

def _generate_examples(self, filepath) -> Tuple[int, Dict]:
"""Yields examples as (key, example) tuples."""
logger.info(f"Generating examples from {filepath}")
uid = itertools.count(0)
if self.config.schema == "source":
for key, sample in enumerate(data):
yield key, sample
with open(filepath, encoding="utf-8") as f:
tokens = []
ner_tags = []
for line in f:
if line == "" or line == "\n":
if tokens:
id = next(uid)
yield id, {
"id": id,
"tokens": tokens,
"ner_tags": ner_tags,
}
next(uid)
tokens = []
ner_tags = []

elif self.config.schema == "bigbio_kb":
for i, sample in enumerate(data):
feature_dict = {
"id": uid,
"document_id": "NULL",
"passages": [],
"entities": [],
"relations": [],
"events": [],
"coreferences": [],
else:
# each line is a tab-separated token and tag
splits = line.split("\t")
tokens.append(splits[0])
ner_tags.append(splits[1].rstrip())
# last example
id = next(uid)
yield id, {
"id": id,
"tokens": tokens,
"ner_tags": ner_tags,
}
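# The trailing yield emits the final sentence when the file does not end
# with a blank line.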

uid += 1
offset_start = 0
for token, tag in zip(sample["tokens"], sample["ner_tags"]):
offset_start += len(token) + 1
feature_dict["entities"].append(
{
"id": uid,
"offsets": [[offset_start, offset_start + len(token)]],
"text": [token],
"type": tag,
"normalized": [],
}
)
uid += 1

# entities
yield i, feature_dict
elif self.config.schema == "bigbio_kb":
with open(filepath, encoding="utf-8") as f:
tokens = []
ner_tags = []
for line in f:
if line == "" or line == "\n":
document_id, document = self._parse_sentence(
tokens, ner_tags, uid
)
yield document_id, document
else:
token, tag = line.split("\t")
tokens.append(token.strip())
ner_tags.append(tag.strip())
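
A minimal sketch of how the rewritten loader could be exercised; the config
name assumes the usual BigBio `{dataset}_{schema}` naming convention and is
not taken from this diff:

```python
import datasets

# Load the bigbio_kb view directly from the dataset script this PR rewrites.
# "jnlpba_bigbio_kb" is an assumed config name following BigBio conventions.
kb = datasets.load_dataset(
    "bigbio/biodatasets/jnlpba/jnlpba.py", name="jnlpba_bigbio_kb"
)

doc = kb["train"][0]
text = doc["passages"][0]["text"][0]
for entity in doc["entities"]:
    start, end = entity["offsets"][0]
    # Entity offsets are character indices into the passage text.
    print(entity["type"], text[start:end])
```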
24 changes: 11 additions & 13 deletions bigbio/hub/hub_repos/jnlpba/README.md
@@ -26,24 +26,22 @@ bigbio_tasks:
- **Tasks:** NER


NER for Bio-Entities
The data come from the GENIA version 3.02 corpus (Kim et al., 2003), which was
built from a controlled MEDLINE search using the MeSH terms "human", "blood
cells", and "transcription factors". From this search, 2,000 abstracts were
selected and hand-annotated according to a small taxonomy of 48 classes based
on a chemical classification; 36 of the terminal classes were used to annotate
the GENIA corpus.
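
Each line of the distributed IOB2 files pairs a token with its tag, separated
by a tab, and blank lines separate sentences; for example (illustrative
tokens):

```
IL-2	B-DNA
gene	I-DNA
expression	O
```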



## Citation Information

```
@inproceedings{collier-kim-2004-introduction,
title = "Introduction to the Bio-entity Recognition Task at {JNLPBA}",
author = "Collier, Nigel and Kim, Jin-Dong",
booktitle = "Proceedings of the International Joint Workshop
on Natural Language Processing in Biomedicine and its Applications
({NLPBA}/{B}io{NLP})",
month = aug # " 28th and 29th", year = "2004",
address = "Geneva, Switzerland",
publisher = "COLING",
url = "https://aclanthology.org/W04-1213",
pages = "73--78",
@inproceedings{collier2004introduction,
title={Introduction to the bio-entity recognition task at JNLPBA},
author={Collier, Nigel and Ohta, Tomoko and Tsuruoka, Yoshimasa and Tateisi, Yuka and Kim, Jin-Dong},
booktitle={Proceedings of the International Joint Workshop on Natural Language Processing in Biomedicine \
and its Applications (NLPBA/BioNLP)},
pages={73--78},
year={2004}
}

```