From 8485320f0c7830e1fa805a8e6a12f208b418ce4a Mon Sep 17 00:00:00 2001 From: David Kartchner Date: Fri, 27 May 2022 08:43:58 -0600 Subject: [PATCH 1/6] add missing requirements file --- requirements.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..1dd4a8a3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +loguru==0.5.3 +bioc==2.0.post1 +numpy==1.21.2 +# pandas==1.3.3 +pybrat==0.1.4 +datasets==2.0.0 +black~=22.0 +flake8>=3.8.3 +isort>=5.0.0 +aiohttp==3.8.1 +openpyxl>=3.0.9 \ No newline at end of file From ac048204b75cf23e6ae5e1d370734f44b8a2c878 Mon Sep 17 00:00:00 2001 From: David_Kartchner Date: Thu, 4 Jan 2024 15:18:14 -0500 Subject: [PATCH 2/6] Add SourceData NLP NER/NED dataset --- bigbio/hub/hub_repos/sourcedata_nlp/README.md | 42 ++ .../hub/hub_repos/sourcedata_nlp/__init__.py | 0 .../hub/hub_repos/sourcedata_nlp/bigbiohub.py | 590 ++++++++++++++++++ .../sourcedata_nlp/sourcedata_nlp.py | 439 +++++++++++++ 4 files changed, 1071 insertions(+) create mode 100644 bigbio/hub/hub_repos/sourcedata_nlp/README.md create mode 100644 bigbio/hub/hub_repos/sourcedata_nlp/__init__.py create mode 100644 bigbio/hub/hub_repos/sourcedata_nlp/bigbiohub.py create mode 100644 bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/README.md b/bigbio/hub/hub_repos/sourcedata_nlp/README.md new file mode 100644 index 00000000..2d791dec --- /dev/null +++ b/bigbio/hub/hub_repos/sourcedata_nlp/README.md @@ -0,0 +1,42 @@ +--- +language: + - en +bigbio_language: + - English +license: "CC-BY 4.0" +bigbio_license_shortname: cc-by-4.0 +multilinguality: monolingual +pretty_name: SourceData NLP +homepage: https://sourcedata.embo.org/ +bigbio_pubmed: false +bigbio_public: true +bigbio_tasks: + - NAMED_ENTITY_RECOGNITION + - NAMED_ENTITY_DISAMBIGUATION +paperswithcode_id: sourcedata-nlp +--- + + +# Dataset Card for 
SourceData NLP + +## Dataset Description + +- **Homepage:** https://sourcedata.embo.org/ +- **Pubmed:** False +- **Public:** True +- **Tasks:** NER,NED + + +The SourceData-NLP is a named entity recognition and entity linking/disambiguation dataset produced through the routine curation of papers during the publication process. All annotations are in figure legends from published papers in molecular and cell biologyThe dataset consists of eight classes of biomedical entities (small molecules, gene products, subcellular components, cell lines, cell types, tissues, organisms, and diseases), their role in the experimental design, and the nature of the experimental method as an additional class. SourceData-NLP contains more than 620,000 annotated biomedical entities, curated from 18,689 figures in 3,223 papers in molecular and cell biology. + + +## Citation Information + +``` +@article{abreu2023sourcedata, + title={The SourceData-NLP dataset: integrating curation into scientific publishing for training large language models}, + author={Abreu-Vicente, Jorge and Sonntag, Hannah and Eidens, Thomas and Lemberger, Thomas}, + journal={arXiv preprint arXiv:2310.20440}, + year={2023} +} +``` \ No newline at end of file diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/__init__.py b/bigbio/hub/hub_repos/sourcedata_nlp/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/bigbiohub.py b/bigbio/hub/hub_repos/sourcedata_nlp/bigbiohub.py new file mode 100644 index 00000000..f4da7bb7 --- /dev/null +++ b/bigbio/hub/hub_repos/sourcedata_nlp/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = 
SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + 
+kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + 
Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. 
type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. 
"Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" + ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + 
example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = 
fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + `parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + 
entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py new file mode 100644 index 00000000..58d2a820 --- /dev/null +++ b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py @@ -0,0 +1,439 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +We present the SourceData-NLP dataset produced through the routine curation of papers during the publication process. +A unique feature of this dataset is its emphasis on the annotation of bioentities in figure legends. +We annotate eight classes of biomedical entities (small molecules, gene products, subcellular components, +cell lines, cell types, tissues, organisms, and diseases), their role in the experimental design, +and the nature of the experimental method as an additional class. +SourceData-NLP contains more than 620,000 annotated biomedical entities, curated from 18,689 figures in +3,223 papers in molecular and cell biology. + +[bigbio_schema_name] = kb +""" + +import itertools +import json +import os +from typing import Dict, List, Tuple + +import datasets + +from bigbiohub import BigBioConfig, Tasks, kb_features + +_LANGUAGES = ["English"] +_PUBMED = True +_LOCAL = False +_DISPLAYNAME = "SourceData-NLP" + +_CITATION = """\ +@article{abreu2023sourcedata, + title={The SourceData-NLP dataset: integrating curation into scientific publishing + for training large language models}, + author={Abreu-Vicente, Jorge and Sonntag, Hannah and Eidens, Thomas and Lemberger, Thomas}, + journal={arXiv preprint arXiv:2310.20440}, + year={2023} +} +""" + +_DATASETNAME = "sourcedata_nlp" + +_DESCRIPTION = """\ +SourceData is an NER/NED dataset of manual, expert annotations of nine +entity types in figure captions from biomedical research papers. 
+""" + +_HOMEPAGE = "https://sourcedata.embo.org/" + + +_LICENSE = "CC_BY_4p0" + + +_URLS = { + _DATASETNAME: "https://huggingface.co/datasets/EMBO/SourceData/resolve/main/bigbio/source_data_json_splits_2.0.2.zip" +} + + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_DISAMBIGUATION, Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "2.0.2" + +_BIGBIO_VERSION = "1.0.0" + + +class SourceDataNlpDataset(datasets.GeneratorBasedBuilder): + """NER + NED dataset of multiple entity types from figure captions of scientific publications""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="sourcedata_nlp_source", + version=SOURCE_VERSION, + description="sourcedata_nlp source schema", + schema="source", + subset_id="sourcedata_nlp", + ), + # BigBioConfig( + # name="sourcedata_nlp_roles_source", + # version=SOURCE_VERSION, + # description="sourcedata_nlp source schema", + # schema="source", + # subset_id="sourcedata_nlp", + # ), + BigBioConfig( + name="sourcedata_nlp_bigbio_kb", + version=BIGBIO_VERSION, + description="sourcedata_nlp BigBio schema", + schema="bigbio_kb", + subset_id="sourcedata_nlp", + ), + # BigBioConfig( + # name="sourcedata_nlp_roles_bigbio_kb", + # version=BIGBIO_VERSION, + # description="sourcedata_nlp BigBio schema for role labeling", + # schema="bigbio_kb", + # subset_id="sourcedata_nlp", + # ), + # BigBioConfig( + # name="sourcedata_nlp_gene_roles_bigbio_kb", + # version=BIGBIO_VERSION, + # description="sourcedata_nlp BigBio schema for role labeling", + # schema="bigbio_kb", + # subset_id="sourcedata_nlp", + # ), + # BigBioConfig( + # name="sourcedata_nlp_sm_roles_bigbio_kb", + # version=BIGBIO_VERSION, + # description="sourcedata_nlp BigBio schema for role labeling", + # schema="bigbio_kb", + # subset_id="sourcedata_nlp", + # ), + ] + + DEFAULT_CONFIG_NAME = "sourcedata_nlp_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == 
"source": + features = datasets.Features( + { + "doi": datasets.Value("string"), + "abstract": datasets.Value("string"), + # "split": datasets.Value("string"), + "figures": [ + { + "fig_id": datasets.Value("string"), + "label": datasets.Value("string"), + "fig_graphic_url": datasets.Value("string"), + "panels": [ + { + "panel_id": datasets.Value("string"), + "text": datasets.Value("string"), + "panel_graphic_url": datasets.Value("string"), + "entities": [ + { + "annotation_id": datasets.Value("string"), + "source": datasets.Value("string"), + "category": datasets.Value("string"), + "entity_type": datasets.Value("string"), + "role": datasets.Value("string"), + "text": datasets.Value("string"), + "ext_ids": datasets.Value("string"), + "norm_text": datasets.Value("string"), + "ext_dbs": datasets.Value("string"), + "in_caption": datasets.Value("bool"), + "ext_names": datasets.Value("string"), + "ext_tax_ids": datasets.Value("string"), + "ext_tax_names": datasets.Value("string"), + "ext_urls": datasets.Value("string"), + "offsets": [datasets.Value("int64")], + } + ], + } + ], + } + ], + } + ) + + elif self.config.schema == "bigbio_kb": + features = kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": os.path.join(data_dir, "train.jsonl"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={ + "filepath": os.path.join(data_dir, "test.jsonl"), + }, + ), + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={ + "filepath": os.path.join(data_dir, 
"validation.jsonl"), + }, + ), + ] + + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + if self.config.schema == "source": + with open(filepath) as fstream: + no_panels = 0 + no_entities = 0 + has_panels = 0 + has_entities = 0 + for line in fstream: + document = self._parse_document(line) + # print(document["doi"]) + # print(document) + doc_figs = document["figures"] + all_figures = [] + for fig in doc_figs: + all_panels = [] + figure = { + "fig_id": fig["fig_id"], + "label": fig["label"], + "fig_graphic_url": fig["fig_graphic_url"], + } + # print(figure) + for p in fig["panels"]: + panel = { + "panel_id": p["panel_id"], + "text": p["text"].strip(), + "panel_graphic_url": p["panel_graphic_url"], + "entities": [ + { + "annotation_id": t["tag_id"], + "source": t["source"], + "category": t["category"], + "entity_type": t["entity_type"], + "role": t["role"], + "text": t["text"], + "ext_ids": t["ext_ids"], + "norm_text": t["norm_text"], + "ext_dbs": t["ext_dbs"], + "in_caption": bool(t["in_caption"]), + "ext_names": t["ext_names"], + "ext_tax_ids": t["ext_tax_ids"], + "ext_tax_names": t["ext_tax_names"], + "ext_urls": t["ext_urls"], + "offsets": t["local_offsets"], + # "document_offsets": [datasets.Value("int64")] + } + for t in p["tags"] + ], + } + for e in panel["entities"]: + assert type(e["offsets"]) == list + if len(panel["entities"]) == 0: + no_entities += 1 + continue + else: + has_entities += 1 + all_panels.append(panel) + # print(panel) + + figure["panels"] = all_panels + + # Pass on all figures that aren't split into panels + if len(all_panels) == 0: + # print(figure + no_panels += 1 + continue + else: + has_panels += 1 + all_figures.append(figure) + + # print(figure) + + output = { + "doi": document["doi"], + "abstract": document["abstract"], + "figures": all_figures, + } + yield document["doi"], output + # print() + # print(f"{has_panels=}") + # print(f"{has_entities=}") + # 
print(f"{no_panels=}") + # print(f"{no_entities=}") + + elif self.config.schema == "bigbio_kb": + uid = itertools.count(0) + # incorrect_length = 0 + # untyped_ents = 0 + # correct_length = 0 + + with open(filepath) as fstream: + for line in fstream: + output = {} + document = self._parse_document(line) + + # Get ids for each document + list of passages + output["id"] = next(uid) + output["document_id"] = document["doi"] + output["passages"] = document["passages"] + for i, passage in enumerate(output["passages"]): + passage["id"] = next(uid) + passage_text = passage["text"].strip() + passage["text"] = [passage_text] + passage_offsets = passage["offsets"] + if i == 0: + passage_offsets[1] = len(passage_text.strip()) + passage["offsets"] = [ + [ + passage_offsets[0], + passage_offsets[0] + passage_offsets[1], + ] + ] + # if passage_offsets[1] - passage_offsets[0] != len( + # passage["text"] + # ): + # incorrect_length += 1 + # else: + # correct_length += 1 + + # print(output["passages"]) + + # Parse out entities + entities = [] + for fig in document["figures"]: + for panel in fig["panels"]: + for tag in panel["tags"]: + # Create two separate ents if both role and tag are labeled. 
+ ent_type = self._get_entity_type(tag) + if ent_type is not None: + ent = { + "id": next(uid), + "type": ent_type, + "text": [tag["text"]], + "offsets": [tag["document_offsets"]], + "normalized": [ + {"db_name": db_name, "db_id": db_id} + for db_name, db_id in zip( + tag["ext_dbs"], tag["ext_ids"] + ) + ], + } + # all_types.add(ent["type"]) + entities.append(ent) + + # When entity has a role as well, add an additional entity for this + # Necessary to create duplicate entity due to constraints of BigBio schema + # These can be consolidated by matching up document ID + offsets + role = self._get_entity_role(tag) + if role is not None: + role_ent = { + "id": next(uid), + "type": role, + "text": [tag["text"]], + "offsets": [tag["document_offsets"]], + "normalized": [ + {"db_name": db_name, "db_id": db_id} + for db_name, db_id in zip( + tag["ext_dbs"], tag["ext_ids"] + ) + ], + } + entities.append(role_ent) + + # if ent_type is None and role is None: + # untyped_ents += 1 + + output["entities"] = entities + # print(output['entities']) + + output["relations"] = [] + output["events"] = [] + output["coreferences"] = [] + + # print(output) + + yield output["document_id"], output + + # print() + # print(f"{correct_length =}") + # print(f"{incorrect_length =}") + # print(f"{untyped_ents =}") + # elif self.config.schema + + def _parse_document(self, raw_document): + doc = json.loads(raw_document.strip()) + return doc + + def _get_entity_type(self, tag): + if tag["entity_type"] == "molecule": + return "SMALL_MOLECULE" + elif tag["entity_type"] in ["geneprod", "gene", "protein"]: + return "GENEPROD" + elif tag["entity_type"] == "subcellular": + return "SUBCELLULAR" + elif tag["entity_type"] == "cell_type": + return "CELL_TYPE" + elif tag["entity_type"] == "tissue": + return "TISSUE" + elif tag["entity_type"] == "organism": + return "ORGANISM" + elif tag["category"] == "assay": + return "EXP_ASSAY" + elif tag["category"] == "disease": + return "DISEASE" + elif tag["entity_type"] 
== "cell_line": + return "CELL_LINE" + # else: + # if tag["role"] == "": + # print(tag["text"]) + + def _get_entity_role(self, tag): + if tag["entity_type"] == "molecule": + if tag["role"] == "intervention": + return "CONTROLLED_VAR" + elif tag["role"] == "assayed": + return "MEASURED_VAR" + elif tag["entity_type"] in ["geneprod", "gene", "protein"]: + if tag["role"] == "intervention": + return "CONTROLLED_VAR" + elif tag["role"] == "assayed": + return "MEASURED_VAR" + + +# This allows you to run your dataloader with `python [dataset_name].py` during development +# TODO: Remove this before making your PR +if __name__ == "__main__": + datasets.load_dataset(__file__, name="sourcedata_nlp_source") From f2cd5832fa2f00c15c9defe0f4090bfdfec0a9d6 Mon Sep 17 00:00:00 2001 From: David_Kartchner Date: Fri, 26 Jan 2024 11:22:39 -0500 Subject: [PATCH 3/6] revise description of Sourcedata_NLP dataset --- bigbio/hub/hub_repos/sourcedata_nlp/README.md | 2 +- bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/README.md b/bigbio/hub/hub_repos/sourcedata_nlp/README.md index 2d791dec..1ff24057 100644 --- a/bigbio/hub/hub_repos/sourcedata_nlp/README.md +++ b/bigbio/hub/hub_repos/sourcedata_nlp/README.md @@ -27,7 +27,7 @@ paperswithcode_id: sourcedata-nlp - **Tasks:** NER,NED -The SourceData-NLP is a named entity recognition and entity linking/disambiguation dataset produced through the routine curation of papers during the publication process. All annotations are in figure legends from published papers in molecular and cell biologyThe dataset consists of eight classes of biomedical entities (small molecules, gene products, subcellular components, cell lines, cell types, tissues, organisms, and diseases), their role in the experimental design, and the nature of the experimental method as an additional class. 
SourceData-NLP contains more than 620,000 annotated biomedical entities, curated from 18,689 figures in 3,223 papers in molecular and cell biology. +SourceData-NLP is a named entity recognition and entity linking/disambiguation dataset produced through the routine curation of papers during the publication process. All annotations are in figure legends from published papers in molecular and cell biology. The dataset consists of eight classes of biomedical entities (small molecules, gene products, subcellular components, cell lines, cell types, tissues, organisms, and diseases), their role in the experimental design, and the nature of the experimental method as an additional class. SourceData-NLP contains more than 620,000 annotated biomedical entities, curated from 18,689 figures in 3,223 papers in molecular and cell biology. ## Citation Information diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py index 58d2a820..2816857d 100644 --- a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py +++ b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py @@ -52,7 +52,7 @@ _DATASETNAME = "sourcedata_nlp" _DESCRIPTION = """\ -SourceData is an NER/NED dataset of manual, expert annotations of nine +SourceData is an NER/NED dataset of expert annotations of nine entity types in figure captions from biomedical research papers. 
""" From 2f408313ed77b6d1e09dae235cb0363b4bcffc4d Mon Sep 17 00:00:00 2001 From: David_Kartchner Date: Fri, 26 Jan 2024 13:25:43 -0500 Subject: [PATCH 4/6] Closes #912 --- bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py index 2816857d..43b359f5 100644 --- a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py +++ b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py @@ -190,7 +190,6 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: return [ datasets.SplitGenerator( name=datasets.Split.TRAIN, - # Whatever you put in gen_kwargs will be passed to _generate_examples gen_kwargs={ "filepath": os.path.join(data_dir, "train.jsonl"), }, From 0af72412fe4af69417132e2570ca81ca5fbd9848 Mon Sep 17 00:00:00 2001 From: Florian Borchert Date: Mon, 21 Oct 2024 14:50:02 +0200 Subject: [PATCH 5/6] Fixed Formatting and Import Issues --- .../hub/hub_repos/sourcedata_nlp/__init__.py | 0 .../sourcedata_nlp/sourcedata_nlp.py | 101 ++---------------- requirements.txt | 5 + 3 files changed, 11 insertions(+), 95 deletions(-) delete mode 100644 bigbio/hub/hub_repos/sourcedata_nlp/__init__.py create mode 100644 requirements.txt diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/__init__.py b/bigbio/hub/hub_repos/sourcedata_nlp/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py index 43b359f5..82308f6b 100644 --- a/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py +++ b/bigbio/hub/hub_repos/sourcedata_nlp/sourcedata_nlp.py @@ -32,7 +32,7 @@ import datasets -from bigbiohub import BigBioConfig, Tasks, kb_features +from .bigbiohub import BigBioConfig, Tasks, kb_features _LANGUAGES = ["English"] _PUBMED = True @@ -63,7 +63,9 @@ _URLS = { - _DATASETNAME: 
"https://huggingface.co/datasets/EMBO/SourceData/resolve/main/bigbio/source_data_json_splits_2.0.2.zip" + _DATASETNAME: ( + "https://huggingface.co/datasets/EMBO/SourceData/resolve/main/bigbio/source_data_json_splits_2.0.2.zip" + ) } @@ -88,13 +90,6 @@ class SourceDataNlpDataset(datasets.GeneratorBasedBuilder): schema="source", subset_id="sourcedata_nlp", ), - # BigBioConfig( - # name="sourcedata_nlp_roles_source", - # version=SOURCE_VERSION, - # description="sourcedata_nlp source schema", - # schema="source", - # subset_id="sourcedata_nlp", - # ), BigBioConfig( name="sourcedata_nlp_bigbio_kb", version=BIGBIO_VERSION, @@ -102,27 +97,6 @@ class SourceDataNlpDataset(datasets.GeneratorBasedBuilder): schema="bigbio_kb", subset_id="sourcedata_nlp", ), - # BigBioConfig( - # name="sourcedata_nlp_roles_bigbio_kb", - # version=BIGBIO_VERSION, - # description="sourcedata_nlp BigBio schema for role labeling", - # schema="bigbio_kb", - # subset_id="sourcedata_nlp", - # ), - # BigBioConfig( - # name="sourcedata_nlp_gene_roles_bigbio_kb", - # version=BIGBIO_VERSION, - # description="sourcedata_nlp BigBio schema for role labeling", - # schema="bigbio_kb", - # subset_id="sourcedata_nlp", - # ), - # BigBioConfig( - # name="sourcedata_nlp_sm_roles_bigbio_kb", - # version=BIGBIO_VERSION, - # description="sourcedata_nlp BigBio schema for role labeling", - # schema="bigbio_kb", - # subset_id="sourcedata_nlp", - # ), ] DEFAULT_CONFIG_NAME = "sourcedata_nlp_source" @@ -133,7 +107,6 @@ def _info(self) -> datasets.DatasetInfo: { "doi": datasets.Value("string"), "abstract": datasets.Value("string"), - # "split": datasets.Value("string"), "figures": [ { "fig_id": datasets.Value("string"), @@ -213,14 +186,8 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: if self.config.schema == "source": with open(filepath) as fstream: - no_panels = 0 - no_entities = 0 - has_panels = 0 - has_entities = 0 for line in fstream: document = self._parse_document(line) - # print(document["doi"]) - # 
print(document) doc_figs = document["figures"] all_figures = [] for fig in doc_figs: @@ -230,7 +197,6 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: "label": fig["label"], "fig_graphic_url": fig["fig_graphic_url"], } - # print(figure) for p in fig["panels"]: panel = { "panel_id": p["panel_id"], @@ -253,7 +219,6 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: "ext_tax_names": t["ext_tax_names"], "ext_urls": t["ext_urls"], "offsets": t["local_offsets"], - # "document_offsets": [datasets.Value("int64")] } for t in p["tags"] ], @@ -261,43 +226,25 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: for e in panel["entities"]: assert type(e["offsets"]) == list if len(panel["entities"]) == 0: - no_entities += 1 continue - else: - has_entities += 1 all_panels.append(panel) - # print(panel) figure["panels"] = all_panels # Pass on all figures that aren't split into panels if len(all_panels) == 0: - # print(figure - no_panels += 1 continue - else: - has_panels += 1 all_figures.append(figure) - # print(figure) - output = { "doi": document["doi"], "abstract": document["abstract"], "figures": all_figures, } yield document["doi"], output - # print() - # print(f"{has_panels=}") - # print(f"{has_entities=}") - # print(f"{no_panels=}") - # print(f"{no_entities=}") elif self.config.schema == "bigbio_kb": uid = itertools.count(0) - # incorrect_length = 0 - # untyped_ents = 0 - # correct_length = 0 with open(filepath) as fstream: for line in fstream: @@ -321,16 +268,6 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: passage_offsets[0] + passage_offsets[1], ] ] - # if passage_offsets[1] - passage_offsets[0] != len( - # passage["text"] - # ): - # incorrect_length += 1 - # else: - # correct_length += 1 - - # print(output["passages"]) - - # Parse out entities entities = [] for fig in document["figures"]: for panel in fig["panels"]: @@ -345,12 +282,9 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: "offsets": 
[tag["document_offsets"]], "normalized": [ {"db_name": db_name, "db_id": db_id} - for db_name, db_id in zip( - tag["ext_dbs"], tag["ext_ids"] - ) + for db_name, db_id in zip(tag["ext_dbs"], tag["ext_ids"]) ], } - # all_types.add(ent["type"]) entities.append(ent) # When entity has a role as well, add an additional entity for this @@ -365,33 +299,19 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: "offsets": [tag["document_offsets"]], "normalized": [ {"db_name": db_name, "db_id": db_id} - for db_name, db_id in zip( - tag["ext_dbs"], tag["ext_ids"] - ) + for db_name, db_id in zip(tag["ext_dbs"], tag["ext_ids"]) ], } entities.append(role_ent) - # if ent_type is None and role is None: - # untyped_ents += 1 - output["entities"] = entities - # print(output['entities']) output["relations"] = [] output["events"] = [] output["coreferences"] = [] - # print(output) - yield output["document_id"], output - # print() - # print(f"{correct_length =}") - # print(f"{incorrect_length =}") - # print(f"{untyped_ents =}") - # elif self.config.schema - def _parse_document(self, raw_document): doc = json.loads(raw_document.strip()) return doc @@ -415,9 +335,6 @@ def _get_entity_type(self, tag): return "DISEASE" elif tag["entity_type"] == "cell_line": return "CELL_LINE" - # else: - # if tag["role"] == "": - # print(tag["text"]) def _get_entity_role(self, tag): if tag["entity_type"] == "molecule": @@ -430,9 +347,3 @@ def _get_entity_role(self, tag): return "CONTROLLED_VAR" elif tag["role"] == "assayed": return "MEASURED_VAR" - - -# This allows you to run your dataloader with `python [dataset_name].py` during development -# TODO: Remove this before making your PR -if __name__ == "__main__": - datasets.load_dataset(__file__, name="sourcedata_nlp_source") diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..61864be6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +bioc==2.0.post4 +datasets>=2.8.0,<3.0.0 +numpy>=1.21.2 +openpyxl>=3.0.9,<3.1.0 
+pandas>=1.3.3 \ No newline at end of file From 7a5b3e61cb6c7ad5800899394a9ccc1822e86e6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20S=C3=A4nger?= Date: Wed, 23 Oct 2024 12:19:31 +0200 Subject: [PATCH 6/6] fix: Fix license name in README.md --- bigbio/hub/hub_repos/sourcedata_nlp/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigbio/hub/hub_repos/sourcedata_nlp/README.md b/bigbio/hub/hub_repos/sourcedata_nlp/README.md index 1ff24057..0ffb2fe3 100644 --- a/bigbio/hub/hub_repos/sourcedata_nlp/README.md +++ b/bigbio/hub/hub_repos/sourcedata_nlp/README.md @@ -3,7 +3,7 @@ language: - en bigbio_language: - English -license: "CC-BY 4.0" +license: "cc-by-4.0" bigbio_license_shortname: cc-by-4.0 multilinguality: monolingual pretty_name: SourceData NLP @@ -39,4 +39,4 @@ SourceData-NLP is a named entity recognition and entity linking/disambiguation d journal={arXiv preprint arXiv:2310.20440}, year={2023} } -``` \ No newline at end of file +```