Commit

Merge pull request #33 from joshmoore/metadata-rework

Metadata rework

joshmoore authored Aug 22, 2024
2 parents 7e06529 + 043aef2 commit 766aaca
Showing 13 changed files with 737 additions and 429 deletions.
63 changes: 45 additions & 18 deletions README.md
@@ -136,43 +136,70 @@ $ ome2024-ngff-challenge --input-bucket=bia-integrator-data --input-endpoint=htt

</details>

-## Converting your data
+## CLI Commands

### `resave`: convert your data

The `ome2024-ngff-challenge` tool can be used to convert an OME-Zarr 0.4 dataset
that is based on Zarr v2. The input data will **not be modified** in any way and
a full copy of the data will be created at the chosen location.

-### Getting started
+#### Getting started

```
-ome2024-ngff-challenge input.zarr output.zarr
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr
```

is the most basic invocation of the tool. If you do not choose a license, the
application will fail with:

```
No license set. Choose one of the Creative Commons license (e.g., `--cc-by`) or skip RO-Crate creation (`--rocrate-skip`)
```

#### Licenses

There are a number of other license options to choose from. We suggest one of:

- `--cc-by`: credit must be given to the creator
- `--cc0`: add your data to the public domain

Alternatively, you can choose your own license, e.g.,

`--rocrate-license=https://creativecommons.org/licenses/by-nc/4.0/`

to restrict commercial use of your data. Additionally, you can disable metadata
collection entirely.

**Note:** you will need to add metadata later for your dataset to be considered
valid.

#### Re-running the script

-is the most basic invocation of the tool. If you would like to re-run the script
-with different parameters, you can additionally set `--output-overwrite` to
-ignore a previous conversion:
+If you would like to re-run the script with different parameters, you can
+additionally set `--output-overwrite` to ignore a previous conversion:

```
-ome2024-ngff-challenge input.zarr output.zarr --output-overwrite
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-overwrite
```

-### Writing in parallel
+#### Writing in parallel

By default, 16 chunks of data will be processed simultaneously in order to bound
memory usage. You can increase this number based on your local resources:

```
-ome2024-ngff-challenge input.zarr output.zarr --output-threads=128
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-threads=128
```
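The bounded-parallelism idea described above (process at most N chunks at once so memory stays bounded) can be sketched generically in Python. This is an illustration of the concept only, not the tool's actual implementation; `copy_chunk` is a hypothetical stand-in for the per-chunk read/convert/write work:

```python
from concurrent.futures import ThreadPoolExecutor

def copy_chunk(i):
    # Stand-in for reading, converting, and writing one chunk of data.
    return i * i

def resave_all(n_chunks, threads=16):
    # At most `threads` chunks are in flight at once, bounding memory use;
    # raising `threads` trades memory for throughput.
    with ThreadPoolExecutor(max_workers=threads) as pool:
        return list(pool.map(copy_chunk, range(n_chunks)))

print(resave_all(4))  # [0, 1, 4, 9]
```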

-### Reading/writing remotely
+#### Reading/writing remotely

If you would like to avoid downloading and/or uploading the Zarr datasets, you can
set S3 parameters on the command-line which will then treat the input and/or
output datasets as a prefix within an S3 bucket:

```
-ome2024-ngff-challenge \
+ome2024-ngff-challenge resave --cc-by \
--input-bucket=BUCKET \
--input-endpoint=HOST \
--input-anon \
@@ -183,15 +210,15 @@ ome2024-ngff-challenge \
```
A small example you can try yourself:

```
-ome2024-ngff-challenge \
+ome2024-ngff-challenge resave --cc-by \
--input-bucket=idr \
--input-endpoint=https://uk1s3.embassy.ebi.ac.uk \
--input-anon \
zarr/v0.4/idr0062A/6001240.zarr \
/tmp/6001240.zarr
```

-### Reading/writing via a script
+#### Reading/writing via a script

Another R/W option is to have `resave.py` generate a script which you can
execute later. If you pass `--output-script`, then rather than generate the
@@ -201,7 +228,7 @@ executed later.
For example, running:

```
-ome2024-ngff-challenge dev2/input.zarr /tmp/scripts.zarr --output-script
+ome2024-ngff-challenge resave --cc-by dev2/input.zarr /tmp/scripts.zarr --output-script
```

produces a dataset with one `zarr.json` file and 3 `convert.sh` scripts:
@@ -225,22 +252,22 @@ cargo install zarrs_tools
```
export PATH=$PATH:$HOME/.cargo/bin
```

-### Optimizing chunks and shards
+#### Optimizing chunks and shards

Finally, there is not yet a single heuristic for determining the chunk and shard
sizes that will work for all data. Pass the `--output-chunks` and
`--output-shards` flags in order to set the size of chunks and shards for all
resolutions:

```
-ome2024-ngff-challenge input.zarr output.zarr --output-chunks=1,1,1,256,256 --output-shards=1,1,1,2048,2048
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-chunks=1,1,1,256,256 --output-shards=1,1,1,2048,2048
```
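A quick way to sanity-check a chunk/shard pairing like the one above is to verify that every shard extent is a whole multiple of the corresponding chunk extent (assuming the usual Zarr v3 sharding constraint that shards store whole chunks):

```python
# Values from the example invocation above.
chunks = (1, 1, 1, 256, 256)
shards = (1, 1, 1, 2048, 2048)

# Each shard extent should divide evenly by the matching chunk extent.
assert all(s % c == 0 for s, c in zip(shards, chunks))

# How many chunks fit inside one shard, per dimension.
chunks_per_shard = [s // c for s, c in zip(shards, chunks)]
print(chunks_per_shard)  # [1, 1, 1, 8, 8]
```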

Alternatively, you can use a JSON file to review and manually optimize the
chunking and sharding parameters on a per-resolution basis:

```
-ome2024-ngff-challenge input.zarr parameters.json --output-write-details
+ome2024-ngff-challenge resave --cc-by input.zarr parameters.json --output-write-details
```

This will write a JSON file of the form:
@@ -254,7 +281,7 @@ the "multiscales". Edits to this file can be read back in using the
`--output-read-details` flag:

```
-ome2024-ngff-challenge input.zarr output.zarr --output-read-details=parameters.json
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-read-details=parameters.json
```

Note: Changes to the shape are ignored.
2 changes: 1 addition & 1 deletion dev2/resave.py
@@ -329,7 +329,7 @@ def write_rocrate(write_store):
properties={
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
},
)
biosample = crate.add(
@@ -29,7 +29,7 @@
"@type": "Dataset",
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
"hasPart": {
"@id": "./dros-mel-image.zarr/"
}
2 changes: 1 addition & 1 deletion dev3/2024-07-02/example-metadata/minimal.json
@@ -28,7 +28,7 @@
"@type": "Dataset",
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
"hasPart": {
"@id": "./dros-mel-image.zarr/"
}
4 changes: 2 additions & 2 deletions dev3/2024-07-02/ro-crate-metadata-proposal.md
@@ -53,7 +53,7 @@ imagining technique.
"@type": "Dataset",
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
"hasPart": {
"@id": "./dros-mel-image.zarr/"
}
@@ -138,7 +138,7 @@ The metadata json file would look like:
"@type": "Dataset",
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
"hasPart": {
"@id": "./dros-mel-image.zarr/"
}
@@ -11,7 +11,7 @@
properties={
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
},
)
biosample = crate.add(
@@ -23,7 +23,7 @@
"@type": "Dataset",
"name": "Light microscopy photo of a fly",
"description": "Light microscopy photo of a fruit fly.",
"licence": "https://creativecommons.org/licenses/by/4.0/",
"license": "https://creativecommons.org/licenses/by/4.0/",
"resultOf": {
"@id": "#16e30b5b-9995-4ff2-97e6-66a9c025f0d3"
}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -65,7 +65,7 @@ enable = true
substitution.files = ["src/ome2024_ngff_challenge/__init__.py"]

[tool.poetry.scripts]
-ome2024-ngff-challenge = "ome2024_ngff_challenge.resave:cli"
+ome2024-ngff-challenge = "ome2024_ngff_challenge:dispatch"

[tool.setuptools_scm]
write_to = "src/ome2024_ngff_challenge/_version.py"
25 changes: 25 additions & 0 deletions src/ome2024_ngff_challenge/__init__.py
@@ -6,6 +6,31 @@

from __future__ import annotations

import argparse
import sys

from .lookup import cli as lookup_cli
from .resave import cli as resave_cli

__version__ = "0.0.0"

__all__ = ["__version__"]


def dispatch(args=sys.argv[1:]):
"""
Parses the arguments contained in `args` and passes
them to `main`. If no images are converted, raises
SystemExit. Otherwise, returns the number of images.
"""
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
subparsers = parser.add_subparsers(help="subparser help")
resave_cli(subparsers)
lookup_cli(subparsers)
# Upcoming parsers to be moved to submodules
subparsers.add_parser("validate", help="TBD: evaluate a converted fileset locally")
subparsers.add_parser(
"update", help="TBD: updated the RO-Crate metadata in a fileset"
)
ns = parser.parse_args(args)
return ns.func(ns)
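The registration pattern used by `dispatch` (each submodule adds its own subparser, binds an entry point with `set_defaults(func=...)`, and `dispatch` finishes with `ns.func(ns)`) can be sketched in isolation. Here `greet_cli` is an invented toy subcommand, not part of the package:

```python
import argparse

def greet_cli(subparsers):
    # Each submodule registers its own subcommand and binds its entry
    # point, analogous to resave_cli and lookup_cli above.
    parser = subparsers.add_parser("greet", help="print a greeting")
    parser.add_argument("name")
    parser.set_defaults(func=lambda ns: f"hello, {ns.name}")

def dispatch(args):
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", required=True)
    greet_cli(subparsers)
    ns = parser.parse_args(args)
    # The chosen subcommand's function receives the parsed namespace.
    return ns.func(ns)

print(dispatch(["greet", "world"]))  # hello, world
```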
67 changes: 67 additions & 0 deletions src/ome2024_ngff_challenge/lookup.py
@@ -0,0 +1,67 @@
from __future__ import annotations

import argparse
import logging

import requests

from .utils import configure_logging

LOGGER = logging.getLogger(__file__)


def cli(subparsers: argparse._SubParsersAction):
cmd = "ome2024-ngff-challenge lookup"
desc = f"""
The `lookup` subcommand will search the EBI OLS service
for metadata identifiers matching the given input.
BASIC
Simplest example: {cmd} "light-sheet"
"""
parser = subparsers.add_parser(
"lookup",
help="lookup metadata from EBI OLS",
description=desc,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.set_defaults(func=main)
parser.add_argument(
"--log", default="info", help="'error', 'warn', 'info', 'debug' or 'trace'"
)
parser.add_argument("text")


def parse(ns: argparse.Namespace):
"""
Parse the namespace arguments provided by the dispatcher
"""

configure_logging(ns, LOGGER)


def main(ns: argparse.Namespace):
text = ns.text
url = f"https://www.ebi.ac.uk/ols4/api/search?q={text}&obsoletes=false&local=false&rows=10&start=0&format=json&lang=en"
response = requests.get(url, timeout=(5, 30))
if response.status_code == 200:
result = response.json()
docs = result["response"]["docs"]
header = "ONTOLOGY \tTERM \tLABEL \tDESCRIPTION"
print(header) # noqa: T201
for doc in docs:
onto = doc["ontology_name"]
term = doc["short_form"]
name = doc["label"]
desc = "" if not doc["description"] else doc["description"][0]
desc = desc.split("\n")[0][:70] # At most first 70 chars of first line
print(f"""{onto:10s}\t{term:20s}\t{name:30s}\t{desc}""") # noqa: T201

else:
raise Exception(response)
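The formatting logic in `main` depends only on the JSON shape of the OLS search payload (`{"response": {"docs": [...]}}`). A minimal offline sketch with a hand-written sample document (no network call; all field values below are invented) shows the same truncation and column layout:

```python
# Hand-written payload in the shape `main` expects from the OLS search
# endpoint; the document's field values are invented for illustration.
sample = {
    "response": {
        "docs": [
            {
                "ontology_name": "fbbi",
                "short_form": "FBbi_00000001",
                "label": "example imaging method",
                "description": ["First line of a description.\nSecond line."],
            }
        ]
    }
}

rows = []
for doc in sample["response"]["docs"]:
    desc = "" if not doc["description"] else doc["description"][0]
    desc = desc.split("\n")[0][:70]  # at most 70 chars of the first line
    rows.append(
        f"{doc['ontology_name']:10s}\t{doc['short_form']:20s}\t{doc['label']:30s}\t{desc}"
    )

print(rows[0])
```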