From 82457e667c2b233edae972817eefe18676fd6394 Mon Sep 17 00:00:00 2001
From: Florian Borchert
Date: Wed, 23 Oct 2024 12:34:48 +0200
Subject: [PATCH] Add GraSCCo (closes #950) (#951)

* GraSCCo PHI

* remove future tasks

* Remove future tasks

* Removed sentences

* Remove spaces
---
 bigbio/hub/hub_repos/grascco/README.md    |  47 ++
 bigbio/hub/hub_repos/grascco/bigbiohub.py | 590 ++++++++++++++++++++++
 bigbio/hub/hub_repos/grascco/grascco.py   | 228 +++++++++
 3 files changed, 865 insertions(+)
 create mode 100644 bigbio/hub/hub_repos/grascco/README.md
 create mode 100644 bigbio/hub/hub_repos/grascco/bigbiohub.py
 create mode 100644 bigbio/hub/hub_repos/grascco/grascco.py

diff --git a/bigbio/hub/hub_repos/grascco/README.md b/bigbio/hub/hub_repos/grascco/README.md
new file mode 100644
index 00000000..900c0b29
--- /dev/null
+++ b/bigbio/hub/hub_repos/grascco/README.md
@@ -0,0 +1,47 @@
+---
+language:
+  - de
+bigbio_language:
+  - German
+license: cc-by-4.0
+bigbio_license_shortname: CC_BY_4p0
+multilinguality: monolingual
+pretty_name: GraSCCo
+homepage: https://zenodo.org/records/6539131
+bigbio_pubmed: false
+bigbio_public: true
+bigbio_tasks:
+  - NAMED_ENTITY_RECOGNITION
+---
+
+
+# Dataset Card for GraSCCo
+
+## Dataset Description
+
+- **Homepage:** https://zenodo.org/records/6539131
+- **Pubmed:** False
+- **Public:** True
+- **Tasks:** NER
+
+GraSCCo is a collection of artificially generated semi-structured and unstructured German-language clinical summaries. These summaries are formulated as letters from the hospital to the patient's GP after in-patient or out-patient care.
+This is common practice in Germany, Austria and Switzerland.
+
+The creation of the GraSCCo documents was inspired by existing clinical texts, but all names and dates are purely fictional.
+There is no relation to existing patients, clinicians or institutions. While the texts try to represent the range of German clinical language as faithfully as possible, medical plausibility should not be assumed.
+
+GraSCCo can therefore only be used to train clinical language models, not clinical domain models.
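+
+## How to Use
+
+A minimal loading sketch with the Hugging Face `datasets` library. This assumes the dataset is published on the Hub as `bigbio/grascco`; the config names are those defined in `grascco.py`:
+
+```python
+from datasets import load_dataset
+
+# Source schema: raw UIMA CAS feature structures
+ds_source = load_dataset("bigbio/grascco", name="grascco_phi_source", trust_remote_code=True)
+
+# Harmonized BigBio KB schema: one passage per letter plus PHI entity spans
+# (trust_remote_code may be required for script-based datasets, depending on your `datasets` version)
+ds_kb = load_dataset("bigbio/grascco", name="grascco_phi_bigbio_kb", trust_remote_code=True)
+
+# Only a single "train" split is defined
+print(ds_kb["train"][0]["entities"][:3])
+```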
+ + +## Citation Information + +``` +@incollection{modersohn2022grascco, + title={GRASCCO—The First Publicly Shareable, Multiply-Alienated German Clinical Text Corpus}, + author={Modersohn, Luise and Schulz, Stefan and Lohr, Christina and Hahn, Udo}, + booktitle={German Medical Data Sciences 2022--Future Medicine: More Precise, More Integrative, More Sustainable!}, + pages={66--72}, + year={2022}, + publisher={IOS Press} +} +``` diff --git a/bigbio/hub/hub_repos/grascco/bigbiohub.py b/bigbio/hub/hub_repos/grascco/bigbiohub.py new file mode 100644 index 00000000..f4da7bb7 --- /dev/null +++ b/bigbio/hub/hub_repos/grascco/bigbiohub.py @@ -0,0 +1,590 @@ +from collections import defaultdict +from dataclasses import dataclass +from enum import Enum +import logging +from pathlib import Path +from types import SimpleNamespace +from typing import TYPE_CHECKING, Dict, Iterable, List, Tuple + +import datasets + +if TYPE_CHECKING: + import bioc + +logger = logging.getLogger(__name__) + + +BigBioValues = SimpleNamespace(NULL="") + + +@dataclass +class BigBioConfig(datasets.BuilderConfig): + """BuilderConfig for BigBio.""" + + name: str = None + version: datasets.Version = None + description: str = None + schema: str = None + subset_id: str = None + + +class Tasks(Enum): + NAMED_ENTITY_RECOGNITION = "NER" + NAMED_ENTITY_DISAMBIGUATION = "NED" + EVENT_EXTRACTION = "EE" + RELATION_EXTRACTION = "RE" + COREFERENCE_RESOLUTION = "COREF" + QUESTION_ANSWERING = "QA" + TEXTUAL_ENTAILMENT = "TE" + SEMANTIC_SIMILARITY = "STS" + TEXT_PAIRS_CLASSIFICATION = "TXT2CLASS" + PARAPHRASING = "PARA" + TRANSLATION = "TRANSL" + SUMMARIZATION = "SUM" + TEXT_CLASSIFICATION = "TXTCLASS" + + +entailment_features = datasets.Features( + { + "id": datasets.Value("string"), + "premise": datasets.Value("string"), + "hypothesis": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +pairs_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "label": datasets.Value("string"), + } +) + +qa_features = datasets.Features( + { + "id": datasets.Value("string"), + "question_id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "question": datasets.Value("string"), + "type": datasets.Value("string"), + "choices": [datasets.Value("string")], + "context": datasets.Value("string"), + "answer": datasets.Sequence(datasets.Value("string")), + } +) + +text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "labels": [datasets.Value("string")], + } +) + +text2text_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text_1": datasets.Value("string"), + "text_2": datasets.Value("string"), + "text_1_name": datasets.Value("string"), + "text_2_name": datasets.Value("string"), + } +) + +kb_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "passages": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + } + ], + "entities": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + "normalized": [ + 
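+                    # entity grounding in external vocabularies (e.g. UMLS or SNOMED CT), as db_name/db_id pairs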
{ + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + "events": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + # refers to the text_bound_annotation of the trigger + "trigger": { + "text": datasets.Sequence(datasets.Value("string")), + "offsets": datasets.Sequence([datasets.Value("int32")]), + }, + "arguments": [ + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ], + } + ], + "coreferences": [ + { + "id": datasets.Value("string"), + "entity_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "relations": [ + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arg1_id": datasets.Value("string"), + "arg2_id": datasets.Value("string"), + "normalized": [ + { + "db_name": datasets.Value("string"), + "db_id": datasets.Value("string"), + } + ], + } + ], + } +) + + +TASK_TO_SCHEMA = { + Tasks.NAMED_ENTITY_RECOGNITION.name: "KB", + Tasks.NAMED_ENTITY_DISAMBIGUATION.name: "KB", + Tasks.EVENT_EXTRACTION.name: "KB", + Tasks.RELATION_EXTRACTION.name: "KB", + Tasks.COREFERENCE_RESOLUTION.name: "KB", + Tasks.QUESTION_ANSWERING.name: "QA", + Tasks.TEXTUAL_ENTAILMENT.name: "TE", + Tasks.SEMANTIC_SIMILARITY.name: "PAIRS", + Tasks.TEXT_PAIRS_CLASSIFICATION.name: "PAIRS", + Tasks.PARAPHRASING.name: "T2T", + Tasks.TRANSLATION.name: "T2T", + Tasks.SUMMARIZATION.name: "T2T", + Tasks.TEXT_CLASSIFICATION.name: "TEXT", +} + +SCHEMA_TO_TASKS = defaultdict(set) +for task, schema in TASK_TO_SCHEMA.items(): + SCHEMA_TO_TASKS[schema].add(task) +SCHEMA_TO_TASKS = dict(SCHEMA_TO_TASKS) + +VALID_TASKS = set(TASK_TO_SCHEMA.keys()) +VALID_SCHEMAS = set(TASK_TO_SCHEMA.values()) + +SCHEMA_TO_FEATURES = { + "KB": kb_features, + "QA": qa_features, + "TE": entailment_features, + "T2T": text2text_features, + "TEXT": text_features, + "PAIRS": pairs_features, +} + + +def get_texts_and_offsets_from_bioc_ann(ann: "bioc.BioCAnnotation") -> Tuple: + + offsets = [(loc.offset, loc.offset + loc.length) for loc in ann.locations] + + text = ann.text + + if len(offsets) > 1: + i = 0 + texts = [] + for start, end in offsets: + chunk_len = end - start + texts.append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + texts = [text] + + return offsets, texts + + +def remove_prefix(a: str, prefix: str) -> str: + if a.startswith(prefix): + a = a[len(prefix) :] + return a + + +def parse_brat_file( + txt_file: Path, + annotation_file_suffixes: List[str] = None, + parse_notes: bool = False, +) -> Dict: + """ + Parse a brat file into the schema defined below. + `txt_file` should be the path to the brat '.txt' file you want to parse, e.g. 'data/1234.txt' + Assumes that the annotations are contained in one or more of the corresponding '.a1', '.a2' or '.ann' files, + e.g. 'data/1234.ann' or 'data/1234.a1' and 'data/1234.a2'. + Will include annotator notes, when `parse_notes == True`. + brat_features = datasets.Features( + { + "id": datasets.Value("string"), + "document_id": datasets.Value("string"), + "text": datasets.Value("string"), + "text_bound_annotations": [ # T line in brat, e.g. 
type or event trigger + { + "offsets": datasets.Sequence([datasets.Value("int32")]), + "text": datasets.Sequence(datasets.Value("string")), + "type": datasets.Value("string"), + "id": datasets.Value("string"), + } + ], + "events": [ # E line in brat + { + "trigger": datasets.Value( + "string" + ), # refers to the text_bound_annotation of the trigger, + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "arguments": datasets.Sequence( + { + "role": datasets.Value("string"), + "ref_id": datasets.Value("string"), + } + ), + } + ], + "relations": [ # R line in brat + { + "id": datasets.Value("string"), + "head": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "tail": { + "ref_id": datasets.Value("string"), + "role": datasets.Value("string"), + }, + "type": datasets.Value("string"), + } + ], + "equivalences": [ # Equiv line in brat + { + "id": datasets.Value("string"), + "ref_ids": datasets.Sequence(datasets.Value("string")), + } + ], + "attributes": [ # M or A lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "value": datasets.Value("string"), + } + ], + "normalizations": [ # N lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "resource_name": datasets.Value( + "string" + ), # Name of the resource, e.g. "Wikipedia" + "cuid": datasets.Value( + "string" + ), # ID in the resource, e.g. 534366 + "text": datasets.Value( + "string" + ), # Human readable description/name of the entity, e.g. "Barack Obama" + } + ], + ### OPTIONAL: Only included when `parse_notes == True` + "notes": [ # # lines in brat + { + "id": datasets.Value("string"), + "type": datasets.Value("string"), + "ref_id": datasets.Value("string"), + "text": datasets.Value("string"), + } + ], + }, + ) + """ + + example = {} + example["document_id"] = txt_file.with_suffix("").name + with txt_file.open() as f: + example["text"] = f.read() + + # If no specific suffixes of the to-be-read annotation files are given - take standard suffixes + # for event extraction + if annotation_file_suffixes is None: + annotation_file_suffixes = [".a1", ".a2", ".ann"] + + if len(annotation_file_suffixes) == 0: + raise AssertionError( + "At least one suffix for the to-be-read annotation files should be given!" 
+ ) + + ann_lines = [] + for suffix in annotation_file_suffixes: + annotation_file = txt_file.with_suffix(suffix) + if annotation_file.exists(): + with annotation_file.open() as f: + ann_lines.extend(f.readlines()) + + example["text_bound_annotations"] = [] + example["events"] = [] + example["relations"] = [] + example["equivalences"] = [] + example["attributes"] = [] + example["normalizations"] = [] + + if parse_notes: + example["notes"] = [] + + for line in ann_lines: + line = line.strip() + if not line: + continue + + if line.startswith("T"): # Text bound + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + ann["offsets"] = [] + span_str = remove_prefix(fields[1], (ann["type"] + " ")) + text = fields[2] + for span in span_str.split(";"): + start, end = span.split() + ann["offsets"].append([int(start), int(end)]) + + # Heuristically split text of discontiguous entities into chunks + ann["text"] = [] + if len(ann["offsets"]) > 1: + i = 0 + for start, end in ann["offsets"]: + chunk_len = end - start + ann["text"].append(text[i : chunk_len + i]) + i += chunk_len + while i < len(text) and text[i] == " ": + i += 1 + else: + ann["text"] = [text] + + example["text_bound_annotations"].append(ann) + + elif line.startswith("E"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + ann["type"], ann["trigger"] = fields[1].split()[0].split(":") + + ann["arguments"] = [] + for role_ref_id in fields[1].split()[1:]: + argument = { + "role": (role_ref_id.split(":"))[0], + "ref_id": (role_ref_id.split(":"))[1], + } + ann["arguments"].append(argument) + + example["events"].append(ann) + + elif line.startswith("R"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["type"] = fields[1].split()[0] + + ann["head"] = { + "role": fields[1].split()[1].split(":")[0], + "ref_id": fields[1].split()[1].split(":")[1], + } + ann["tail"] = { + "role": fields[1].split()[2].split(":")[0], + "ref_id": fields[1].split()[2].split(":")[1], + } + + example["relations"].append(ann) + + # '*' seems to be the legacy way to mark equivalences, + # but I couldn't find any info on the current way + # this might have to be adapted dependent on the brat version + # of the annotation + elif line.startswith("*"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["ref_ids"] = fields[1].split()[1:] + + example["equivalences"].append(ann) + + elif line.startswith("A") or line.startswith("M"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + + info = fields[1].split() + ann["type"] = info[0] + ann["ref_id"] = info[1] + + if len(info) > 2: + ann["value"] = info[2] + else: + ann["value"] = "" + + example["attributes"].append(ann) + + elif line.startswith("N"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + ann["resource_name"] = info[2].split(":")[0] + ann["cuid"] = info[2].split(":")[1] + example["normalizations"].append(ann) + + elif parse_notes and line.startswith("#"): + ann = {} + fields = line.split("\t") + + ann["id"] = fields[0] + ann["text"] = fields[2] if len(fields) == 3 else BigBioValues.NULL + + info = fields[1].split() + + ann["type"] = info[0] + ann["ref_id"] = info[1] + example["notes"].append(ann) + + return example + + +def brat_parse_to_bigbio_kb(brat_parse: Dict) -> Dict: + """ + Transform a brat parse (conforming to the standard brat schema) obtained with + 
`parse_brat_file` into a dictionary conforming to the `bigbio-kb` schema (as defined in ../schemas/kb.py) + :param brat_parse: + """ + + unified_example = {} + + # Prefix all ids with document id to ensure global uniqueness, + # because brat ids are only unique within their document + id_prefix = brat_parse["document_id"] + "_" + + # identical + unified_example["document_id"] = brat_parse["document_id"] + unified_example["passages"] = [ + { + "id": id_prefix + "_text", + "type": "abstract", + "text": [brat_parse["text"]], + "offsets": [[0, len(brat_parse["text"])]], + } + ] + + # get normalizations + ref_id_to_normalizations = defaultdict(list) + for normalization in brat_parse["normalizations"]: + ref_id_to_normalizations[normalization["ref_id"]].append( + { + "db_name": normalization["resource_name"], + "db_id": normalization["cuid"], + } + ) + + # separate entities and event triggers + unified_example["events"] = [] + non_event_ann = brat_parse["text_bound_annotations"].copy() + for event in brat_parse["events"]: + event = event.copy() + event["id"] = id_prefix + event["id"] + trigger = next( + tr + for tr in brat_parse["text_bound_annotations"] + if tr["id"] == event["trigger"] + ) + if trigger in non_event_ann: + non_event_ann.remove(trigger) + event["trigger"] = { + "text": trigger["text"].copy(), + "offsets": trigger["offsets"].copy(), + } + for argument in event["arguments"]: + argument["ref_id"] = id_prefix + argument["ref_id"] + + unified_example["events"].append(event) + + unified_example["entities"] = [] + anno_ids = [ref_id["id"] for ref_id in non_event_ann] + for ann in non_event_ann: + entity_ann = ann.copy() + entity_ann["id"] = id_prefix + entity_ann["id"] + entity_ann["normalized"] = ref_id_to_normalizations[ann["id"]] + unified_example["entities"].append(entity_ann) + + # massage relations + unified_example["relations"] = [] + skipped_relations = set() + for ann in brat_parse["relations"]: + if ( + ann["head"]["ref_id"] not in anno_ids + or ann["tail"]["ref_id"] not in anno_ids + ): + skipped_relations.add(ann["id"]) + continue + unified_example["relations"].append( + { + "arg1_id": id_prefix + ann["head"]["ref_id"], + "arg2_id": id_prefix + ann["tail"]["ref_id"], + "id": id_prefix + ann["id"], + "type": ann["type"], + "normalized": [], + } + ) + if len(skipped_relations) > 0: + example_id = brat_parse["document_id"] + logger.info( + f"Example:{example_id}: The `bigbio_kb` schema allows `relations` only between entities." + f" Skip (for now): " + f"{list(skipped_relations)}" + ) + + # get coreferences + unified_example["coreferences"] = [] + for i, ann in enumerate(brat_parse["equivalences"], start=1): + is_entity_cluster = True + for ref_id in ann["ref_ids"]: + if not ref_id.startswith("T"): # not textbound -> no entity + is_entity_cluster = False + elif ref_id not in anno_ids: # event trigger -> no entity + is_entity_cluster = False + if is_entity_cluster: + entity_ids = [id_prefix + i for i in ann["ref_ids"]] + unified_example["coreferences"].append( + {"id": id_prefix + str(i), "entity_ids": entity_ids} + ) + return unified_example diff --git a/bigbio/hub/hub_repos/grascco/grascco.py b/bigbio/hub/hub_repos/grascco/grascco.py new file mode 100644 index 00000000..47c1c438 --- /dev/null +++ b/bigbio/hub/hub_repos/grascco/grascco.py @@ -0,0 +1,228 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GraSCCo is a collection of artificially generated semi-structured and unstructured German-language clinical summaries.
+These summaries are formulated as letters from the hospital to the patient's GP after in-patient or out-patient care.
+This is common practice in Germany, Austria and Switzerland.
+
+The creation of the GraSCCo documents was inspired by existing clinical texts,
+but all names and dates are purely fictional.
+There is no relation to existing patients, clinicians or institutions.
+While the texts try to represent the range of German clinical language as faithfully as possible,
+medical plausibility should not be assumed.
+
+GraSCCo can therefore only be used to train clinical language models, not clinical domain models.
+"""
+
+import json
+from pathlib import Path
+from typing import Dict, List, Tuple
+
+import datasets
+
+from .bigbiohub import BigBioConfig, Tasks, kb_features, logger
+
+_LOCAL = False
+
+_CITATION = """\
+@incollection{modersohn2022grascco,
+  title={GRASCCO—The First Publicly Shareable, Multiply-Alienated German Clinical Text Corpus},
+  author={Modersohn, Luise and Schulz, Stefan and Lohr, Christina and Hahn, Udo},
+  booktitle={German Medical Data Sciences 2022--Future Medicine: More Precise, More Integrative, More Sustainable!},
+  pages={66--72},
+  year={2022},
+  publisher={IOS Press}
+}
+"""
+
+_DATASETNAME = "grascco"
+
+_DISPLAYNAME = "GraSCCo"
+
+_DESCRIPTION = """\
+GraSCCo is a collection of artificially generated semi-structured and unstructured German-language clinical summaries.
+These summaries are formulated as letters from the hospital to the patient's GP after in-patient or out-patient care.
+This is common practice in Germany, Austria and Switzerland.
+
+The creation of the GraSCCo documents was inspired by existing clinical texts,
+but all names and dates are purely fictional.
+There is no relation to existing patients, clinicians or institutions.
+While the texts try to represent the range of German clinical language as faithfully as possible,
+medical plausibility should not be assumed.
+
+GraSCCo can therefore only be used to train clinical language models, not clinical domain models.
+""" + +_HOMEPAGE = "https://zenodo.org/records/6539131" + +_LICENSE = "CC_BY_4p0" + +_LANGUAGES = ["German"] + +_PUBMED = False + +_URLS = { + _DATASETNAME: { + "phi": "https://zenodo.org/records/11502329/files/grascco_phi_annotation_json.zip?download=1", + }, +} + +_SUPPORTED_TASKS = [Tasks.NAMED_ENTITY_RECOGNITION] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +_UIMA_FEATURES_KEY = "%FEATURE_STRUCTURES" + + +class GraSCCoDataset(datasets.GeneratorBasedBuilder): + """Dataloader for GraSCCo dataset with different annotation layers (PHI, SNOMED CT, etc.)""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [ + BigBioConfig( + name="grascco_phi_source", + version=SOURCE_VERSION, + description="GraSCCo (PHI) source schema", + schema="source", + subset_id="phi", + ), + BigBioConfig( + name="grascco_phi_bigbio_kb", + version=BIGBIO_VERSION, + description="GraSCCo (PHI) BigBio schema", + schema="bigbio_kb", + subset_id="phi", + ), + ] + + DEFAULT_CONFIG_NAME = "grascco_phi_source" + + def _info(self) -> datasets.DatasetInfo: + if self.config.schema == "source": + features = datasets.Features( + { + "document_id": datasets.Value("string"), + _UIMA_FEATURES_KEY: [ + { + "%ID": datasets.Value("int64"), + "%TYPE": datasets.Value("string"), + "@sofa": datasets.Value("int64"), + "@layer": datasets.Value("int64"), + "begin": datasets.Value("int64"), + "end": datasets.Value("int64"), + "name": datasets.Value("string"), + "uiName": datasets.Value("string"), + "documentTitle": datasets.Value("string"), + "sofaString": datasets.Value("string"), + } + ], + } + ) + + elif self.config.schema == "bigbio_kb": + features = kb_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME][self.config.subset_id] + data_dir = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + # Whatever you put in gen_kwargs will be passed to _generate_examples + gen_kwargs={ + "filepath": Path(data_dir) / "grascco_phi_annotation_json", + }, + ), + ] + + def _parse_uima_cas_json(self, filename) -> Dict: + """Parse UIMA CAS JSON file and return parsed elements as well as the raw data""" + with open(filename, "r", encoding="utf-8") as f: + uima_features = json.load(f)[_UIMA_FEATURES_KEY] + phi_elements = [] + for feature in uima_features: + if feature["%TYPE"] == "webanno.custom.PHI": + phi_elements.append(feature) + if feature["%TYPE"] == "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData": + document_title = feature["documentTitle"] + if feature["%TYPE"] == "uima.cas.Sofa": + document_text = feature["sofaString"] + return { + "phi_elements": phi_elements, + "document_title": document_title, + "document_text": document_text, + "uima_features": uima_features, + } + + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + for file_id, file in enumerate(sorted(filepath.glob("*.json"))): + uima_parsed = self._parse_uima_cas_json(file) + doc_id = uima_parsed["document_title"] + if self.config.schema == "source": + yield doc_id, {"document_id": doc_id, _UIMA_FEATURES_KEY: uima_parsed["uima_features"]} + elif self.config.schema == "bigbio_kb": + text = uima_parsed["document_text"] + 
+                relations = []
+                entities = []
+                # Just a single passage; ignoring sentence boundaries from the annotation tool, as these are not reliable
+                passages = [{"id": f"{file_id}-0", "type": "document", "text": [text], "offsets": [[0, len(text)]]}]
+
+                # Other subsets / annotation layers will be added in future GraSCCo releases
+                if self.config.subset_id == "phi":
+                    for phi in sorted(uima_parsed["phi_elements"], key=lambda p: p["begin"]):
+                        e_start = phi["begin"]
+                        e_end = phi["end"]
+                        eid = phi["%ID"]
+                        if "kind" not in phi:
+                            logger.warning(
+                                f"'kind' attribute missing in PHI element with ID {eid} in document {doc_id}"
+                            )
+                            continue
+                        entities.append(
+                            {
+                                "id": f"{file_id}-{eid}",
+                                "type": phi["kind"],
+                                "text": [text[e_start:e_end]],
+                                "offsets": [[e_start, e_end]],
+                                "normalized": [],
+                            }
+                        )
+
+                yield doc_id, {
+                    "id": file_id,
+                    "document_id": doc_id,
+                    "passages": passages,
+                    "entities": entities,
+                    "events": [],
+                    "coreferences": [],
+                    "relations": relations,
+                }