From 87d75531df90d45b397cdc8ec3f259e9b5f04233 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Werner=20Gre=C3=9Fhoff?= Date: Mon, 10 Jun 2024 15:12:13 +0200 Subject: [PATCH] first implementation of OAIPMHReader (#329) * first implementation of OAIPMHReader * introduce a simple way to map OAI records to a dict without expecting a special metadata format. * fix installation requirements * fix tests * small fixes to make the tests run * add error handling * renamed oaipmh_scythe package * handle remarks/questions from review. * replaced access to a real OAI server with a mocking implementation. * Update invenio_vocabularies/datastreams/readers.py Co-authored-by: Pablo Tamarit * Update tests/datastreams/test_datastreams.py Co-authored-by: Pablo Tamarit * Moved reader tests to testreaders.py * add missing copyright in header --------- Co-authored-by: Pablo Tamarit --- invenio_vocabularies/config.py | 2 + invenio_vocabularies/datastreams/readers.py | 83 ++++++++++++ setup.cfg | 2 + tests/datastreams/test_readers.py | 133 ++++++++++++++++++++ 4 files changed, 220 insertions(+) diff --git a/invenio_vocabularies/config.py b/invenio_vocabularies/config.py index d2203667..789963a5 100644 --- a/invenio_vocabularies/config.py +++ b/invenio_vocabularies/config.py @@ -17,6 +17,7 @@ GzipReader, JsonLinesReader, JsonReader, + OAIPMHReader, TarReader, XMLReader, YamlReader, @@ -111,6 +112,7 @@ "yaml": YamlReader, "zip": ZipReader, "xml": XMLReader, + "oai-pmh": OAIPMHReader, } """Data Streams readers.""" diff --git a/invenio_vocabularies/datastreams/readers.py b/invenio_vocabularies/datastreams/readers.py index 313d0de0..bb2a7650 100644 --- a/invenio_vocabularies/datastreams/readers.py +++ b/invenio_vocabularies/datastreams/readers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2021-2024 CERN. +# Copyright (C) 2024 University of Münster. # # Invenio-Vocabularies is free software; you can redistribute it and/or # modify it under the terms of the MIT License; see LICENSE file for more @@ -19,7 +20,11 @@ import requests import yaml +from lxml import etree from lxml.html import parse as html_parse +from oaipmh_scythe import Scythe +from oaipmh_scythe.exceptions import NoRecordsMatch +from oaipmh_scythe.models import Record from .errors import ReaderError from .xml import etree_to_dict @@ -226,3 +231,81 @@ def _iter(self, fp, *args, **kwargs): raise ReaderError(f"Record not found in XML entry.") yield record + + +class OAIPMHReader(BaseReader): + """OAIPMH reader.""" + + def __init__( + self, + *args, + base_url=None, + metadata_prefix=None, + set=None, + from_date=None, + until_date=None, + verb=None, + **kwargs, + ): + """Constructor.""" + self._base_url = base_url + self._metadata_prefix = metadata_prefix if not None else "oai_dc" + self._set = set + self._until = until_date + self._from = from_date + self._verb = verb if not None else "ListRecords" + super().__init__(*args, **kwargs) + + def _iter(self, scythe, *args, **kwargs): + """Read and parse an OAIPMH stream to dict.""" + scythe.class_mapping["ListRecords"] = self.OAIRecord + try: + records = scythe.list_records( + from_=self._from, + until=self._until, + metadata_prefix=self._metadata_prefix, + set_=self._set, + ignore_deleted=True, + ) + for record in records: + yield {"record": record} + except NoRecordsMatch: + raise ReaderError(f"No records found in OAI-PMH request.") + + def read(self, item=None, *args, **kwargs): + """Reads from item or opens the file descriptor from origin.""" + if item: + raise NotImplementedError( + "OAIPMHReader does not support being chained after another reader" + ) + else: + with Scythe(self._base_url) as scythe: + yield from self._iter(scythe=scythe, *args, **kwargs) + + class OAIRecord(Record): + """An XML unpacking implementation for more complicated formats.""" + + def get_metadata(self): + """Extract and return the record's metadata as a dictionary.""" + return xml_to_dict( + self.xml.find(".//" + self._oai_namespace + "metadata").getchildren()[ + 0 + ], + ) + + +def xml_to_dict(tree: etree._Element): + """Convert an XML tree to a dictionary. + + This function takes an XML element tree and converts it into a dictionary. + + Args: + tree: The root element of the XML tree to be converted. + + Returns: + A dictionary with the key "record". + """ + dict_obj = dict() + dict_obj["record"] = etree.tostring(tree) + + return dict_obj diff --git a/setup.cfg b/setup.cfg index 5fd098c5..a194808a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,12 +31,14 @@ install_requires = invenio-records-resources>=6.0.0,<7.0.0 lxml>=4.5.0 PyYAML>=5.4.1 + oaipmh-scythe @ git+https://github.com/ulbmuenster/invenio-oaipmh-scythe.git [options.extras_require] tests = pytest-black-ng>=0.4.0 invenio-app>=1.4.0,<2.0.0 invenio-db[postgresql,mysql]>=1.0.14,<2.0.0 + pytest_httpserver>=1.0.10 pytest-invenio>=2.1.0,<3.0.0 Sphinx>=4.5 elasticsearch7 = diff --git a/tests/datastreams/test_readers.py b/tests/datastreams/test_readers.py index 833ccc7e..bc72cd7e 100644 --- a/tests/datastreams/test_readers.py +++ b/tests/datastreams/test_readers.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2021-2024 CERN. +# Copyright (C) 2024 University of Münster. # # Invenio-Vocabularies is free software; you can redistribute it and/or # modify it under the terms of the MIT License; see LICENSE file for more @@ -16,8 +17,10 @@ import pytest import yaml +from invenio_vocabularies.datastreams.errors import ReaderError from invenio_vocabularies.datastreams.readers import ( JsonReader, + OAIPMHReader, TarReader, YamlReader, ZipReader, @@ -185,4 +188,134 @@ def test_json_element_reader(json_element_file, json_element): assert count == 1 +@pytest.fixture(scope="module") +def oai_response_match(): + response_data = """ + + + 2024-05-29T13:20:04Z + https://services.dnb.de/oai/repository + + +
+ oai:dnb.de/authorities:sachbegriff/1074025261 + 2024-01-01T16:51:21Z + authorities:sachbegriff +
+ + + 00000nz a2200000nc 4500 + 1074025261 + DE-101 + 20240101175121.0 + 150717n||azznnbabn | ana |c + + 1074025261 + http://d-nb.info/gnd/1074025261 + gnd + + + (DE-101)1074025261 + + + (DE-588)1074025261 + + + DE-12 + DE-12 + r:DE-384 + ger + 1210 + rswk + + + gnd1 + + + 31.3b + sswd + + + s + gndgen + + + saz + gndspec + + + g + s + + + Rundbogenhalle + + + Bogenhalle + + + (DE-101)040230236 + (DE-588)4023023-5 + https://d-nb.info/gnd/4023023-5 + Halle + obge + https://d-nb.info/standards/elementset/gnd#broaderTermGeneric + r + Oberbegriff generisch + + + Stahlbetonbauwerke mit großer Spannweite, eingesetzt im Industriebau, z.b.Paketposthalle München; teilweise heute unter Denkmalschutz + + + +
+ +
+ """ + return response_data + + +def test_oaipmh_reader(app, httpserver, oai_response_match): + httpserver.expect_request("/oai/repository").respond_with_data( + response_data=oai_response_match, mimetype="application/xml" + ) + reader = OAIPMHReader( + base_url=httpserver.url_for("/oai/repository"), + metadata_prefix="MARC21plus-1-xml", + set="authorities:sachbegriff", + from_date="2024-01-01T09:00:00Z", + until_date="2024-01-31T10:00:00Z", + ) + result = reader.read() + assert "record" in next(result) + + +@pytest.fixture(scope="module") +def oai_response_no_match(): + response_data = """ + + 2024-05-29T13:09:44Z + https://services.dnb.de/oai/repository + + + """ + return response_data + + +def test_oaipmh_reader_no_records_match(httpserver, oai_response_no_match): + httpserver.expect_request("/oai/repository").respond_with_data( + response_data=oai_response_no_match, mimetype="application/xml" + ) + reader = OAIPMHReader( + base_url=httpserver.url_for("/oai/repository"), + metadata_prefix="MARC21plus-1-xml", + set="authorities:sachbegriff", + from_date="2024-01-01T09:00:00Z", + until_date="2024-01-01T10:00:00Z", + ) + result = reader.read() + with pytest.raises(ReaderError): + next(result) + + # FIXME: add test for csv reader