diff --git a/README.md b/README.md
index dae5e11..6005196 100644
--- a/README.md
+++ b/README.md
@@ -136,43 +136,70 @@ $ ome2024-ngff-challenge --input-bucket=bia-integrator-data --input-endpoint=htt
-## Converting your data
+## CLI Commands
+
+### `resave`: convert your data
 
 The `ome2024-ngff-challenge` tool can be used to convert an OME-Zarr 0.4 dataset
 that is based on Zarr v2. The input data will **not be modified** in any way and
 a full copy of the data will be created at the chosen location.
 
-### Getting started
+#### Getting started
 
 ```
-ome2024-ngff-challenge input.zarr output.zarr
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr
+```
+
+is the most basic invocation of the tool. If you do not choose a license, the
+application will fail with:
+
 ```
+No license set. Choose one of the Creative Commons licenses (e.g., `--cc-by`) or skip RO-Crate creation (`--rocrate-skip`)
+```
+
+#### Licenses
+
+There are several license options to choose from. We suggest one of:
+
+- `--cc-by`: credit must be given to the creator
+- `--cc0`: place your data in the public domain
+
+Alternatively, you can choose your own license, e.g.,
+
+`--rocrate-license=https://creativecommons.org/licenses/by-nc/4.0/`
+
+to restrict commercial use of your data. Additionally, you can disable metadata
+collection entirely with `--rocrate-skip`.
+
+**Note:** you will need to add metadata later for your dataset to be considered
+valid.
+
+#### Re-running the script
 
-is the most basic invocation of the tool. If you would like to re-run the script
-with different parameters, you can additionally set `--output-overwrite` to
-ignore a previous conversion:
+If you would like to re-run the script with different parameters, you can
+additionally set `--output-overwrite` to ignore a previous conversion:
 
 ```
-ome2024-ngff-challenge input.zarr output.zarr --output-overwrite
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-overwrite
 ```
 
-### Writing in parallel
+#### Writing in parallel
 
 By default, 16 chunks of data will be processed simultaneously in order to bound
 memory usage. You can increase this number based on your local resources:
 
 ```
-ome2024-ngff-challenge input.zarr output.zarr --output-threads=128
+ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-threads=128
 ```
 
-### Reading/writing remotely
+#### Reading/writing remotely
 
 If you would like to avoid downloading and/or upload the Zarr datasets, you can
 set S3 parameters on the command-line which will then treat the input and/or
 output datasets as a prefix within an S3 bucket:
 
 ```
-ome2024-ngff-challenge \
+ome2024-ngff-challenge resave --cc-by \
   --input-bucket=BUCKET \
   --input-endpoint=HOST \
   --input-anon \
@@ -183,7 +210,7 @@ ome2024-ngff-challenge \
 A small example you can try yourself:
 
 ```
-ome2024-ngff-challenge \
+ome2024-ngff-challenge resave --cc-by \
   --input-bucket=idr \
   --input-endpoint=https://uk1s3.embassy.ebi.ac.uk \
   --input-anon \
@@ -191,7 +218,7 @@ ome2024-ngff-challenge \
 /tmp/6001240.zarr
 ```
 
-### Reading/writing via a script
+#### Reading/writing via a script
 
 Another R/W option is to have `resave.py` generate a script which you can
 execute later. If you pass `--output-script`, then rather than generate the
@@ -201,7 +228,7 @@ executed later.
For example, running: ``` -ome2024-ngff-challenge dev2/input.zarr /tmp/scripts.zarr --output-script +ome2024-ngff-challenge resave --cc-by dev2/input.zarr /tmp/scripts.zarr --output-script ``` produces a dataset with one `zarr.json` file and 3 `convert.sh` scripts: @@ -225,7 +252,7 @@ cargo install zarrs_tools export PATH=$PATH:$HOME/.cargo/bin ``` -### Optimizing chunks and shards +#### Optimizing chunks and shards Finally, there is not yet a single heuristic for determining the chunk and shard sizes that will work for all data. Pass the `--output-chunks` and @@ -233,14 +260,14 @@ sizes that will work for all data. Pass the `--output-chunks` and resolutions: ``` -ome2024-ngff-challenge input.zarr output.zarr --output-chunks=1,1,1,256,256 --output-shards=1,1,1,2048,2048 +ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-chunks=1,1,1,256,256 --output-shards=1,1,1,2048,2048 ``` Alternatively, you can use a JSON file to review and manually optimize the chunking and sharding parameters on a per-resolution basis: ``` -ome2024-ngff-challenge input.zarr parameters.json --output-write-details +ome2024-ngff-challenge resave --cc-by input.zarr parameters.json --output-write-details ``` This will write a JSON file of the form: @@ -254,7 +281,7 @@ the "multiscales". Edits to this file can be read back in using the `output-read-details` flag: ``` -ome2024-ngff-challenge input.zarr output.zarr --output-read-details=parameters.json +ome2024-ngff-challenge resave --cc-by input.zarr output.zarr --output-read-details=parameters.json ``` Note: Changes to the shape are ignored. diff --git a/dev2/resave.py b/dev2/resave.py index 96dab63..93ad16f 100755 --- a/dev2/resave.py +++ b/dev2/resave.py @@ -329,7 +329,7 @@ def write_rocrate(write_store): properties={ "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", }, ) biosample = crate.add( diff --git a/dev3/2024-07-02/example-metadata/min-specimen-biosample.json b/dev3/2024-07-02/example-metadata/min-specimen-biosample.json index 374d7b7..320b7ea 100644 --- a/dev3/2024-07-02/example-metadata/min-specimen-biosample.json +++ b/dev3/2024-07-02/example-metadata/min-specimen-biosample.json @@ -29,7 +29,7 @@ "@type": "Dataset", "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", "hasPart": { "@id": "./dros-mel-image.zarr/" } diff --git a/dev3/2024-07-02/example-metadata/minimal.json b/dev3/2024-07-02/example-metadata/minimal.json index 7eb4815..b6b61a4 100644 --- a/dev3/2024-07-02/example-metadata/minimal.json +++ b/dev3/2024-07-02/example-metadata/minimal.json @@ -28,7 +28,7 @@ "@type": "Dataset", "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", "hasPart": { "@id": "./dros-mel-image.zarr/" } diff --git a/dev3/2024-07-02/ro-crate-metadata-proposal.md b/dev3/2024-07-02/ro-crate-metadata-proposal.md index 5958cff..e29f464 100644 --- a/dev3/2024-07-02/ro-crate-metadata-proposal.md +++ b/dev3/2024-07-02/ro-crate-metadata-proposal.md @@ -53,7 +53,7 @@ imagining technique. 
"@type": "Dataset", "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", "hasPart": { "@id": "./dros-mel-image.zarr/" } @@ -138,7 +138,7 @@ The metadata json file would look like: "@type": "Dataset", "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", "hasPart": { "@id": "./dros-mel-image.zarr/" } diff --git a/dev3/zarr-crate/example_usage/create_fly_ro_crate_metadata.py b/dev3/zarr-crate/example_usage/create_fly_ro_crate_metadata.py index a74b673..deae3c8 100644 --- a/dev3/zarr-crate/example_usage/create_fly_ro_crate_metadata.py +++ b/dev3/zarr-crate/example_usage/create_fly_ro_crate_metadata.py @@ -11,7 +11,7 @@ properties={ "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", }, ) biosample = crate.add( diff --git a/dev3/zarr-crate/example_usage/example_ro_crate/ro-crate-metadata.json b/dev3/zarr-crate/example_usage/example_ro_crate/ro-crate-metadata.json index 50c06fb..0e184e9 100644 --- a/dev3/zarr-crate/example_usage/example_ro_crate/ro-crate-metadata.json +++ b/dev3/zarr-crate/example_usage/example_ro_crate/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "name": "Light microscopy photo of a fly", "description": "Light microscopy photo of a fruit fly.", - "licence": "https://creativecommons.org/licenses/by/4.0/", + "license": "https://creativecommons.org/licenses/by/4.0/", "resultOf": { "@id": "#16e30b5b-9995-4ff2-97e6-66a9c025f0d3" } diff --git a/pyproject.toml b/pyproject.toml index 48c18ca..d237599 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ enable = true substitution.files = ["src/ome2024_ngff_challenge/__init__.py"] [tool.poetry.scripts] -ome2024-ngff-challenge = "ome2024_ngff_challenge.resave:cli" +ome2024-ngff-challenge = "ome2024_ngff_challenge:dispatch" [tool.setuptools_scm] write_to = "src/ome2024_ngff_challenge/_version.py" diff --git a/src/ome2024_ngff_challenge/__init__.py b/src/ome2024_ngff_challenge/__init__.py index 3307871..10629b2 100644 --- a/src/ome2024_ngff_challenge/__init__.py +++ b/src/ome2024_ngff_challenge/__init__.py @@ -6,6 +6,31 @@ from __future__ import annotations +import argparse +import sys + +from .lookup import cli as lookup_cli +from .resave import cli as resave_cli + __version__ = "0.0.0" __all__ = ["__version__"] + + +def dispatch(args=sys.argv[1:]): + """ + Parses the arguments contained in `args` and passes + them to `main`. If no images are converted, raises + SystemExit. Otherwise, return the number of images. 
+ """ + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + subparsers = parser.add_subparsers(help="subparser help") + resave_cli(subparsers) + lookup_cli(subparsers) + # Upcoming parsers to be moved to submodules + subparsers.add_parser("validate", help="TBD: evaluate a converted fileset locally") + subparsers.add_parser( + "update", help="TBD: updated the RO-Crate metadata in a fileset" + ) + ns = parser.parse_args(args) + return ns.func(ns) diff --git a/src/ome2024_ngff_challenge/lookup.py b/src/ome2024_ngff_challenge/lookup.py new file mode 100644 index 0000000..6b63a18 --- /dev/null +++ b/src/ome2024_ngff_challenge/lookup.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +import argparse +import logging + +import requests + +from .utils import configure_logging + +LOGGER = logging.getLogger(__file__) + + +def cli(subparsers: argparse._SubParsersAction): + cmd = "ome2024-ngff-challenge lookup" + desc = f""" + + +The `lookup` subcommand will take search the EBI OLS service +for metadata identifiers matching the given input. + + +BASIC + + Simplest example: {cmd} "light-sheet" + + + """ + parser = subparsers.add_parser( + "lookup", + help="lookup metadata from EBI OLS", + description=desc, + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.set_defaults(func=main) + parser.add_argument( + "--log", default="info", help="'error', 'warn', 'info', 'debug' or 'trace'" + ) + parser.add_argument("text") + + +def parse(ns: argparse.Namespace): + """ + Parse the namespace arguments provided by the dispatcher + """ + + configure_logging(ns, LOGGER) + + +def main(ns: argparse.Namespace): + text = ns.text + url = f"https://www.ebi.ac.uk/ols4/api/search?q={text}&obsoletes=false&local=false&rows=10&start=0&format=json&lang=en" + response = requests.get(url, timeout=(5, 30)) + if response.status_code == 200: + result = response.json() + docs = result["response"]["docs"] + header = "ONTOLOGY \tTERM \tLABEL \tDESCRIPTION" + print(header) # noqa: T201 + for doc in docs: + onto = doc["ontology_name"] + term = doc["short_form"] + name = doc["label"] + desc = "" if not doc["description"] else doc["description"][0] + desc = desc.split("\n")[0][:70] # At most first 70 chars of first line + print(f"""{onto:10s}\t{term:20s}\t{name:30s}\t{desc}""") # noqa: T201 + + else: + raise Exception(response) diff --git a/src/ome2024_ngff_challenge/resave.py b/src/ome2024_ngff_challenge/resave.py index a0db419..2ff33ae 100755 --- a/src/ome2024_ngff_challenge/resave.py +++ b/src/ome2024_ngff_challenge/resave.py @@ -2,368 +2,35 @@ from __future__ import annotations import argparse -import itertools import json import logging import math import random -import shutil -import sys import time -from importlib.metadata import version as lib_version +import warnings from pathlib import Path -import numpy as np import tensorstore as ts import tqdm -import zarr -from zarr.api.synchronous import sync -from zarr.buffer import Buffer, BufferPrototype +from .utils import ( + Batched, + Config, + SafeEncoder, + TSMetrics, + add_creator, + chunk_iter, + configure_logging, + csv_int, + guess_shards, + strip_version, +) from .zarr_crate.rembi_extension import Biosample, ImageAcquistion, Specimen from .zarr_crate.zarr_extension import ZarrCrate NGFF_VERSION = "0.5" LOGGER = logging.getLogger(__file__) -# -# Helpers -# - - -class Batched: - """ - implementation of itertools.batched for pre-3.12 Python versions - from https://mathspp.com/blog/itertools-batched - """ - - def __init__(self, 
iterable, n: int): - if n < 1: - msg = f"n must be at least one ({n})" - raise ValueError(msg) - self.iter = iter(iterable) - self.n = n - - def __iter__(self): - return self - - def __next__(self): - batch = tuple(itertools.islice(self.iter, self.n)) - if not batch: - raise StopIteration() - return batch - - -class SafeEncoder(json.JSONEncoder): - # Handle any TypeErrors so we are safe to use this for logging - # E.g. dtype obj is not JSON serializable - def default(self, o): - try: - return super().default(o) - except TypeError: - return str(o) - - -def guess_shards(shape: list, chunks: list): - """ - Method to calculate best shard sizes. These values can be written to - a file for the current dataset by using: - - ./resave.py input.zarr output.json --output-write-details - """ - # TODO: hard-coded to return the full size - assert chunks is not None # fixes unused parameter - return shape - - -def chunk_iter(shape: list, chunks: list): - """ - Returns a series of tuples, each containing chunk slice - E.g. for 2D shape/chunks: ((slice(0, 512, 1), slice(0, 512, 1)), (slice(0, 512, 1), slice(512, 1024, 1))...) - Thanks to Davis Bennett. - """ - assert len(shape) == len(chunks) - chunk_iters = [] - for chunk_size, dim_size in zip(chunks, shape): - chunk_tuple = tuple( - slice( - c_index * chunk_size, - min(dim_size, c_index * chunk_size + chunk_size), - 1, - ) - for c_index in range(-(-dim_size // chunk_size)) - ) - chunk_iters.append(chunk_tuple) - return tuple(itertools.product(*chunk_iters)) - - -def csv_int(vstr, sep=",") -> list: - """Convert a string of comma separated values to integers - @returns iterable of floats - """ - values = [] - for v0 in vstr.split(sep): - try: - v = int(v0) - values.append(v) - except ValueError as ve: - raise argparse.ArgumentError( - message=f"Invalid value {v0}, values must be a number" - ) from ve - return values - - -def strip_version(possible_dict) -> None: - """ - If argument is a dict with the key "version", remove it - """ - if isinstance(possible_dict, dict) and "version" in possible_dict: - del possible_dict["version"] - - -def add_creator(json_dict) -> None: - # Add _creator - NB: this will overwrite any existing _creator info - pkg_version = lib_version("ome2024-ngff-challenge") - json_dict["_creator"] = {"name": "ome2024-ngff-challenge", "version": pkg_version} - - -class TextBuffer(Buffer): - """ - Zarr Buffer implementation that simplify saves text at a given location. - """ - - def __init__(self, text): - self.text = text - if isinstance(text, str): - text = np.array(text.encode()) - self._data = text - - -class TSMetrics: - """ - Instances of this class capture the current tensorstore metrics. - - If an existing instance is passed in on creation, it will be stored - in order to deduct previous values from those measured by this instance. 
- """ - - CHUNK_CACHE_READS = "/tensorstore/cache/chunk_cache/reads" - CHUNK_CACHE_WRITES = "/tensorstore/cache/chunk_cache/writes" - - BATCH_READ = "/tensorstore/kvstore/{store_type}/batch_read" - BYTES_READ = "/tensorstore/kvstore/{store_type}/bytes_read" - BYTES_WRITTEN = "/tensorstore/kvstore/{store_type}/bytes_written" - - OTHER = ( - "/tensorstore/cache/hit_count" - "/tensorstore/cache/kvs_cache_read" - "/tensorstore/cache/miss_count" - "/tensorstore/kvstore/{store_type}/delete_range" - "/tensorstore/kvstore/{store_type}/open_read" - "/tensorstore/kvstore/{store_type}/read" - "/tensorstore/kvstore/{store_type}/write" - "/tensorstore/thread_pool/active" - "/tensorstore/thread_pool/max_delay_ns" - "/tensorstore/thread_pool/started" - "/tensorstore/thread_pool/steal_count" - "/tensorstore/thread_pool/task_providers" - "/tensorstore/thread_pool/total_queue_time_ns" - "/tensorstore/thread_pool/work_time_ns" - ) - - def __init__(self, read_config, write_config, start=None): - self.time = time.time() - self.read_type = read_config["kvstore"]["driver"] - self.write_type = write_config["kvstore"]["driver"] - self.start = start - self.data = ts.experimental_collect_matching_metrics() - - def value(self, key): - rv = None - for item in self.data: - k = item["name"] - v = item["values"] - if k == key: - if len(v) > 1: - raise Exception(f"Multiple values for {key}: {v}") - rv = v[0]["value"] - break - if rv is None: - raise Exception(f"unknown key: {key} from {self.data}") - - orig = self.start.value(key) if self.start is not None else 0 - - return rv - orig - - def read(self): - return self.value(self.BYTES_READ.format(store_type=self.read_type)) - - def written(self): - return self.value(self.BYTES_WRITTEN.format(store_type=self.write_type)) - - def elapsed(self): - return self.start is not None and (self.time - self.start.time) or self.time - - -class Config: - """ - Filesystem and S3 configuration information for both tensorstore and zarr-python - """ - - def __init__( - self, - ns: argparse.Namespace, - selection: str, - mode: str, - subpath: Path | str | None = None, - ): - self.ns = ns - self.selection = selection - self.mode = mode - self.subpath = None if not subpath else Path(subpath) - - self.overwrite = False - if selection == "output": - self.overwrite = ns.output_overwrite - - self.path = getattr(ns, f"{selection}_path") - self.anon = getattr(ns, f"{selection}_anon") - self.bucket = getattr(ns, f"{selection}_bucket") - self.endpoint = getattr(ns, f"{selection}_endpoint") - self.region = getattr(ns, f"{selection}_region") - - if self.bucket: - self.ts_store = { - "driver": "s3", - "bucket": self.bucket, - "aws_region": self.region, - } - if self.anon: - self.ts_store["aws_credentials"] = {"anonymous": self.anon} - if self.endpoint: - self.ts_store["endpoint"] = self.endpoint - - store_class = zarr.store.RemoteStore - self.zr_store = store_class( - url=self.s3_string(), - anon=self.anon, - endpoint_url=self.endpoint, - mode=mode, - ) - - else: - self.ts_store = { - "driver": "file", - } - - store_class = zarr.store.LocalStore - self.zr_store = store_class(self.fs_string(), mode=mode) - - self.ts_store["path"] = self.fs_string() - self.ts_config = { - "driver": "zarr" if selection == "input" else "zarr3", - "kvstore": self.ts_store, - } - - self.zr_group = None - self.zr_attrs = None - - def s3_string(self): - return f"s3://{self.bucket}/{self.fs_string()}" - - def fs_string(self): - return str(self.path / self.subpath) if self.subpath else str(self.path) - - def is_s3(self): - return 
bool(self.bucket) - - def s3_endpoint(self): - """ - Returns a representation of the S3 endpoint set on this configuration. - - * "" if this is not an S3 configuration - * "default" if no explicit endpoint is set - * otherwise the URL is returned - """ - if self.is_s3(): - if self.endpoint: - return self.endpoint - return "default" - return "" - - def __str__(self): - if self.is_s3(): - return self.s3_string() - return self.fs_string() - - def __repr__(self): - return ( - f"Config<{self.__str__()}, {self.selection}, {self.mode}, {self.overwrite}>" - ) - - def check_or_delete_path(self): - # If this is local, then delete. - if self.bucket: - raise Exception(f"bucket set ({self.bucket}). Refusing to delete.") - - if self.path.exists(): - # TODO: This should really be an option on zarr-python - # as with tensorstore. - if self.overwrite: - if self.path.is_file(): - self.path.unlink() - else: - shutil.rmtree(self.path) - else: - raise Exception( - f"{self.path} exists. Use --output-overwrite to overwrite" - ) - - def open_group(self): - # Needs zarr_format=2 or we get ValueError("store mode does not support writing") - self.zr_group = zarr.open_group(store=self.zr_store, zarr_format=2) - self.zr_attrs = self.zr_group.attrs - - def create_group(self): - self.zr_group = zarr.Group.create(self.zr_store) - self.zr_attrs = self.zr_group.attrs - - def sub_config(self, subpath: str, create_or_open_group: bool = True): - sub = Config( - self.ns, - self.selection, - self.mode, - subpath if not self.subpath else self.subpath / subpath, - ) - if create_or_open_group: - if sub.selection == "input": - sub.open_group() - elif sub.selection == "output": - sub.create_group() - else: - raise Exception(f"unknown selection: {self.selection}") - return sub - - def ts_read(self): - return ts.open(self.ts_config).result() - - def zr_write_text(self, path: Path, text: str): - text = TextBuffer(text) - filename = self.path / self.subpath / path if self.subpath else self.path / path - sync(self.zr_store.set(str(filename), text)) - - def zr_read_text(self, path: str | Path): - return sync( - self.zr_store.get(str(path), prototype=BufferPrototype(TextBuffer, None)) - ) - - -class Location: - """ - High-level collection of objects and configuration - options related to a source or target location for conversion. 
- """ - def convert_array( input_config: Config, @@ -591,7 +258,7 @@ def convert_image( dimsn_txt = ",".join(map(str, dimension_names)) output_config.zr_write_text( Path(ds_path) / "convert.sh", - f"zarrs_reencode --chunk-shape {chunk_txt} --shard-shape {shard_txt} --dimension-names {dimsn_txt} --validate {input_config} {output_config}\n", + f"zarrs_reencode --chunk-shape {chunk_txt} --shard-shape {shard_txt} --dimension-names {dimsn_txt} --validate {ds_input_config} {ds_output_config}\n", ) else: convert_array( @@ -646,9 +313,9 @@ def convert_image( class ROCrateWriter: def __init__( self, - name: str = "dataset name", - description: str = "dataset description", - data_license: str = "https://creativecommons.org/licenses/by/4.0/", + name: str | None = None, + description: str | None = None, + data_license: str | None = None, organism: str | None = None, modality: str | None = None, ): @@ -669,11 +336,19 @@ def properties(self) -> dict: Return a dictionary containing the base properties like name, description, and license """ - return { - "name": self.name, - "description": self.description, - "licence": self.data_license, - } + + values = {} + if self.name: + values["name"] = self.name + if self.description: + values["description"] = self.description + + if self.data_license: + values["license"] = self.data_license + else: + warnings.warn("No license specified!", stacklevel=1) + + return values def generate(self, dataset="./") -> None: """ @@ -722,11 +397,16 @@ def write( config.zr_write_text(filename, text) -def main(ns: argparse.Namespace, rocrate: ROCrateWriter | None = None) -> int: +def main(ns: argparse.Namespace) -> int: """ - Returns the number of images converted + If no images are converted, raises + SystemExit. Otherwise, return the number of images. """ - converted = 0 + + converted: int = 0 + + parse(ns) + rocrate: ROCrateWriter = ns.rocrate input_config = Config(ns, "input", "r") output_config = Config(ns, "output", "w") @@ -856,16 +536,108 @@ def main(ns: argparse.Namespace, rocrate: ROCrateWriter | None = None) -> int: else: LOGGER.warning(f"no convertible metadata: {input_config.zr_attrs.keys()}") + if converted == 0: + raise SystemExit(1) return converted -def cli(args=sys.argv[1:]): +def cli(subparsers: argparse._SubParsersAction): """ Parses the arguments contained in `args` and passes them to `main`. If no images are converted, raises SystemExit. Otherwise, return the number of images. """ - parser = argparse.ArgumentParser() + cmd = "ome2024-ngff-challenge resave" + desc = f""" + + +The `resave` subcommand will convert an existing Zarr v2 dataset into a Zarr v3 dataset according +to the challenge specification. 
Additionally, a number of options are available for adding metadata + + + +BASIC + + Simplest example: {cmd} --cc-by in.zarr out.zarr + Overwrite existing output: {cmd} --cc-by in.zarr out.zarr --output-overwrite + + +METADATA + + There are three levels of metadata that the challenge is looking for: + + - strongly recommended: license + - recommended: organism and modality + - optional: name and description + + License: CC-BY (most suggested) {cmd} in.zarr out.zarr --cc-by + License: public domain {cmd} in.zarr out.zarr --cc0 + License: choose your own {cmd} in.zarr out.zarr --rocrate-license=https://creativecommons.org/licenses/by-sa/4.0/ + + Organism: Arabidopsis thaliana {cmd} in.zarr out.zarr --cc0 --rocrate-organism=NCBI:txid3702 + Organism: Drosophila melanogaster {cmd} in.zarr out.zarr --cc0 --rocrate-organism=NCBI:txid7227 + Organism: Escherichia coli {cmd} in.zarr out.zarr --cc0 --rocrate-organism=NCBI:txid562 + Organism: Homo sapiens {cmd} in.zarr out.zarr --cc0 --rocrate-organism=NCBI:txid9606 + Organism: Mus musculus {cmd} in.zarr out.zarr --cc0 --rocrate-organism=NCBI:txid10090 + Organism: Saccharomyces cerevisiae {cmd} in.zarr out.zarr --cc0 --rocrate-organism=NCBI:txid4932 + + Modality: bright-field microscopy {cmd} in.zarr out.zarr --cc0 --rocrate-modality=obo:FBbi_00000243 + Modality: confocal microscopy {cmd} in.zarr out.zarr --cc0 --rocrate-modality=obo:FBbi_00000251 + Modality: light-sheet microscopy (SPIM) {cmd} in.zarr out.zarr --cc0 --rocrate-modality=obo:FBbi_00000369 + Modality: scanning electron microscopy {cmd} in.zarr out.zarr --cc0 --rocrate-modality=obo:FBbi_00000257 + Modality: two-photon laser scanning {cmd} in.zarr out.zarr --cc0 --rocrate-modality=obo:FBbi_00000253 + + Define a name {cmd} --cc-by in.zarr out.zarr --rocrate-name="my experiment" + Define a description {cmd} --cc-by in.zarr out.zarr --rocrate-description="More text here" + + No metadata (INVALID DATASET!) {cmd} --rocrate-skip in.zarr out.zarr + + For more information see the online resources for each metadata term: + - https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/ + - https://www.ebi.ac.uk/ols4/ontologies/fbbi + + +CHUNKS/SHARDS + + With the introduction of sharding, it may be necessary to choose a different chunk + size for your dataset. + + Set the same value for all resolutions {cmd} --cc-by in.zarr out.zarr --output-chunks=1,1,1,256,256 --output-shards=1,1,1,2048,2048 + Log the current values for all images {cmd} --cc-by in.zarr cfg.json --output-write-details + Read values from an edited config file {cmd} --cc-by in.zarr out.zarr --output-read-details=cfg.json + + +REMOTE (S3) + + For both the input and output filesets, a number of arguments are available: + + * bucket (required): setting this activates remote access + * endpoint (optional): if not using AWS S3, set this to your provider's endpoint + * region (optional): set the region that you would like to access + * anon (optional): do not attempt to authenticate with the service + + By default, S3 access will try to make use of your environment variables (e.g. AWS_ACCESS_KEY_ID) + or your local configuration (~/.aws) which you may need to deactivate. + + Read from IDR's bucket: {cmd} --cc-by bucket-path/in.zarr out.zarr \\ + --input-anon \\ + --input-bucket=idr \\ + --input-endpoint=https://uk1s3.embassy.ebi.ac.uk + +ADVANCED + + Prepare scripts for conversion. 
{cmd} --cc-by in.zarr out.zarr --output-script + Set number of parallel threads {cmd} --cc-by in.zarr out.zarr --output-threads=128 + Increase logging {cmd} --cc-by in.zarr out.zarr --log=debug + Increase logging even more {cmd} --cc-by in.zarr out.zarr --log=trace + """ + parser = subparsers.add_parser( + "resave", + help="convert Zarr v2 dataset to Zarr v3", + description=desc, + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.set_defaults(func=main) parser.add_argument("--input-bucket") parser.add_argument("--input-endpoint") parser.add_argument("--input-anon", action="store_true") @@ -874,20 +646,75 @@ def cli(args=sys.argv[1:]): parser.add_argument("--output-endpoint") parser.add_argument("--output-anon", action="store_true") parser.add_argument("--output-region", default="us-east-1") - parser.add_argument("--output-overwrite", action="store_true") - parser.add_argument("--output-script", action="store_true") + parser.add_argument( + "--output-overwrite", + action="store_true", + help="CAUTION: Overwrite a previous conversion run", + ) + parser.add_argument( + "--output-script", + action="store_true", + help="CAUTION: Do not run conversion. Instead prepare scripts for later conversion", + ) parser.add_argument( "--output-threads", type=int, default=16, help="number of simultaneous write threads", ) - parser.add_argument("--rocrate-name", type=str) - parser.add_argument("--rocrate-description", type=str) - parser.add_argument("--rocrate-license", type=str) - parser.add_argument("--rocrate-organism", type=str) - parser.add_argument("--rocrate-modality", type=str) - parser.add_argument("--rocrate-skip", action="store_true") + + # Very recommended metadata (SHOULD!) + def license_action(group, arg: str, url: str, recommended: bool = True): + class LicenseAction(argparse.Action): + def __call__(self, parser, args, *unused, **ignore): # noqa: ARG002 + args.rocrate_license = url + if not recommended: + warnings.warn( + f"This license is not recommended: {url}", stacklevel=1 + ) + + desc = url + if not recommended: + desc = "(not recommended) " + url + group.add_argument(arg, action=LicenseAction, nargs=0, help=desc) + + group_lic = parser.add_mutually_exclusive_group() + license_action( + group_lic, "--cc0", "https://creativecommons.org/publicdomain/zero/1.0/" + ) + license_action(group_lic, "--cc-by", "https://creativecommons.org/licenses/by/4.0/") + group_lic.add_argument( + "--rocrate-license", + type=str, + help="URL to another license, e.g., 'https://creativecommons.org/licenses/by/4.0/'", + ) + + # Recommended metadata (SHOULD) + parser.add_argument( + "--rocrate-organism", + type=str, + help="NCBI identifier of the form 'NCBI:txid7227'", + ) + parser.add_argument( + "--rocrate-modality", + type=str, + help="FBbi identifier of the form 'obo:FBbi_00000243'", + ) + + # Optional metadata (MAY) + parser.add_argument( + "--rocrate-name", + type=str, + help="optional name of the dataset; taken from the NGFF metadata if available", + ) + parser.add_argument( + "--rocrate-description", type=str, help="optional description of the dataset" + ) + parser.add_argument( + "--rocrate-skip", + action="store_true", + help="skips the creation of the RO-Crate file", + ) parser.add_argument( "--log", default="warn", help="'error', 'warn', 'info', 'debug' or 'trace'" ) @@ -912,36 +739,23 @@ def cli(args=sys.argv[1:]): ) parser.add_argument("input_path", type=Path) parser.add_argument("output_path", type=Path) - ns = parser.parse_args(args) - # configure logging - if ns.log.upper() == "TRACE": - 
-    else:
-        numeric_level = getattr(logging, ns.log.upper(), None)
-        if not isinstance(numeric_level, int):
-            raise ValueError(f"Invalid log level: {ns.log}. Use 'info' or 'debug'")
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s",
-        datefmt="%Y-%m-%d %H:%M:%S",
-    )
-    LOGGER.setLevel(numeric_level)
 
-    rocrate = None
+
+def parse(ns: argparse.Namespace):
+    """
+    Parse the namespace arguments provided by the dispatcher
+    """
+    configure_logging(ns, LOGGER)
+
+    ns.rocrate = None
     if not ns.rocrate_skip:
         setup = {}
-        for key in ("name", "description", "license", "organism", "modality"):
+        for key in ("name", "description", "organism", "modality"):
             value = getattr(ns, f"rocrate_{key}", None)
             if value:
                 setup[key] = value
-        rocrate = ROCrateWriter(**setup)
-
-    converted = main(ns, rocrate)
-    if converted == 0:
-        raise SystemExit(1)
-    return converted
-
-
-if __name__ == "__main__":
-    cli(sys.argv[1:])
+        if not ns.rocrate_license:
+            message = "No license set. Choose one of the Creative Commons licenses (e.g., `--cc-by`) or skip RO-Crate creation (`--rocrate-skip`)"
+            raise SystemExit(message)
+        setup["data_license"] = ns.rocrate_license
+        ns.rocrate = ROCrateWriter(**setup)
diff --git a/src/ome2024_ngff_challenge/utils.py b/src/ome2024_ngff_challenge/utils.py
new file mode 100755
index 0000000..b0cd852
--- /dev/null
+++ b/src/ome2024_ngff_challenge/utils.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python
+from __future__ import annotations
+
+import argparse
+import itertools
+import json
+import logging
+import shutil
+import time
+from importlib.metadata import version as lib_version
+from pathlib import Path
+
+import numpy as np
+import tensorstore as ts
+import zarr
+from zarr.api.synchronous import sync
+from zarr.buffer import Buffer, BufferPrototype
+
+
+class Batched:
+    """
+    implementation of itertools.batched for pre-3.12 Python versions
+    from https://mathspp.com/blog/itertools-batched
+    """
+
+    def __init__(self, iterable, n: int):
+        if n < 1:
+            msg = f"n must be at least one ({n})"
+            raise ValueError(msg)
+        self.iter = iter(iterable)
+        self.n = n
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        batch = tuple(itertools.islice(self.iter, self.n))
+        if not batch:
+            raise StopIteration()
+        return batch
+
+
+class SafeEncoder(json.JSONEncoder):
+    # Handle any TypeErrors so we are safe to use this for logging
+    # E.g. dtype obj is not JSON serializable
+    def default(self, o):
+        try:
+            return super().default(o)
+        except TypeError:
+            return str(o)
+
+
+def configure_logging(ns: argparse.Namespace, logger: logging.Logger):
+    if ns.log.upper() == "TRACE":
+        numeric_level = 5
+    else:
+        numeric_level = getattr(logging, ns.log.upper(), None)
+        if not isinstance(numeric_level, int):
+            raise ValueError(f"Invalid log level: {ns.log}. Use 'error', 'warn', 'info', 'debug' or 'trace'")
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger.setLevel(numeric_level)
+
+
+def guess_shards(shape: list, chunks: list):
+    """
+    Method to calculate best shard sizes. These values can be written to
+    a file for the current dataset by using:
+
+    ome2024-ngff-challenge resave --cc-by input.zarr output.json --output-write-details
+    """
+    # TODO: hard-coded to return the full size
+    assert chunks is not None  # fixes unused parameter
+    return shape
+
+
+def chunk_iter(shape: list, chunks: list):
+    """
+    Returns a series of tuples, each containing chunk slices
+    E.g. for 2D shape/chunks: ((slice(0, 512, 1), slice(0, 512, 1)), (slice(0, 512, 1), slice(512, 1024, 1))...)
+    Thanks to Davis Bennett.
+    """
+    assert len(shape) == len(chunks)
+    chunk_iters = []
+    for chunk_size, dim_size in zip(chunks, shape):
+        chunk_tuple = tuple(
+            slice(
+                c_index * chunk_size,
+                min(dim_size, c_index * chunk_size + chunk_size),
+                1,
+            )
+            for c_index in range(-(-dim_size // chunk_size))
+        )
+        chunk_iters.append(chunk_tuple)
+    return tuple(itertools.product(*chunk_iters))
+
+
+def csv_int(vstr, sep=",") -> list:
+    """Convert a string of comma separated values to integers
+    @returns list of ints
+    """
+    values = []
+    for v0 in vstr.split(sep):
+        try:
+            v = int(v0)
+            values.append(v)
+        except ValueError as ve:
+            raise argparse.ArgumentError(
+                message=f"Invalid value {v0}, values must be a number"
+            ) from ve
+    return values
+
+
+def strip_version(possible_dict) -> None:
+    """
+    If argument is a dict with the key "version", remove it
+    """
+    if isinstance(possible_dict, dict) and "version" in possible_dict:
+        del possible_dict["version"]
+
+
+def add_creator(json_dict) -> None:
+    # Add _creator - NB: this will overwrite any existing _creator info
+    pkg_version = lib_version("ome2024-ngff-challenge")
+    json_dict["_creator"] = {"name": "ome2024-ngff-challenge", "version": pkg_version}
+
+
+class TextBuffer(Buffer):
+    """
+    Zarr Buffer implementation that simply saves text at a given location.
+    """
+
+    def __init__(self, text):
+        self.text = text
+        if isinstance(text, str):
+            text = np.array(text.encode())
+        self._data = text
+
+
+class TSMetrics:
+    """
+    Instances of this class capture the current tensorstore metrics.
+
+    If an existing instance is passed in on creation, it will be stored
+    in order to subtract previous values from those measured by this instance.
+ """ + + CHUNK_CACHE_READS = "/tensorstore/cache/chunk_cache/reads" + CHUNK_CACHE_WRITES = "/tensorstore/cache/chunk_cache/writes" + + BATCH_READ = "/tensorstore/kvstore/{store_type}/batch_read" + BYTES_READ = "/tensorstore/kvstore/{store_type}/bytes_read" + BYTES_WRITTEN = "/tensorstore/kvstore/{store_type}/bytes_written" + + OTHER = ( + "/tensorstore/cache/hit_count" + "/tensorstore/cache/kvs_cache_read" + "/tensorstore/cache/miss_count" + "/tensorstore/kvstore/{store_type}/delete_range" + "/tensorstore/kvstore/{store_type}/open_read" + "/tensorstore/kvstore/{store_type}/read" + "/tensorstore/kvstore/{store_type}/write" + "/tensorstore/thread_pool/active" + "/tensorstore/thread_pool/max_delay_ns" + "/tensorstore/thread_pool/started" + "/tensorstore/thread_pool/steal_count" + "/tensorstore/thread_pool/task_providers" + "/tensorstore/thread_pool/total_queue_time_ns" + "/tensorstore/thread_pool/work_time_ns" + ) + + def __init__(self, read_config, write_config, start=None): + self.time = time.time() + self.read_type = read_config["kvstore"]["driver"] + self.write_type = write_config["kvstore"]["driver"] + self.start = start + self.data = ts.experimental_collect_matching_metrics() + + def value(self, key): + rv = None + for item in self.data: + k = item["name"] + v = item["values"] + if k == key: + if len(v) > 1: + raise Exception(f"Multiple values for {key}: {v}") + rv = v[0]["value"] + break + if rv is None: + raise Exception(f"unknown key: {key} from {self.data}") + + orig = self.start.value(key) if self.start is not None else 0 + + return rv - orig + + def read(self): + return self.value(self.BYTES_READ.format(store_type=self.read_type)) + + def written(self): + return self.value(self.BYTES_WRITTEN.format(store_type=self.write_type)) + + def elapsed(self): + return self.start is not None and (self.time - self.start.time) or self.time + + +class Config: + """ + Filesystem and S3 configuration information for both tensorstore and zarr-python + """ + + def __init__( + self, + ns: argparse.Namespace, + selection: str, + mode: str, + subpath: Path | str | None = None, + ): + self.ns = ns + self.selection = selection + self.mode = mode + self.subpath = None if not subpath else Path(subpath) + + self.overwrite = False + if selection == "output": + self.overwrite = ns.output_overwrite + + self.path = getattr(ns, f"{selection}_path") + self.anon = getattr(ns, f"{selection}_anon") + self.bucket = getattr(ns, f"{selection}_bucket") + self.endpoint = getattr(ns, f"{selection}_endpoint") + self.region = getattr(ns, f"{selection}_region") + + if self.bucket: + self.ts_store = { + "driver": "s3", + "bucket": self.bucket, + "aws_region": self.region, + } + if self.anon: + self.ts_store["aws_credentials"] = {"anonymous": self.anon} + if self.endpoint: + self.ts_store["endpoint"] = self.endpoint + + store_class = zarr.store.RemoteStore + self.zr_store = store_class( + url=self.s3_string(), + anon=self.anon, + endpoint_url=self.endpoint, + mode=mode, + ) + + else: + self.ts_store = { + "driver": "file", + } + + store_class = zarr.store.LocalStore + self.zr_store = store_class(self.fs_string(), mode=mode) + + self.ts_store["path"] = self.fs_string() + self.ts_config = { + "driver": "zarr" if selection == "input" else "zarr3", + "kvstore": self.ts_store, + } + + self.zr_group = None + self.zr_attrs = None + + def s3_string(self): + return f"s3://{self.bucket}/{self.fs_string()}" + + def fs_string(self): + return str(self.path / self.subpath) if self.subpath else str(self.path) + + def is_s3(self): + return 
bool(self.bucket) + + def s3_endpoint(self): + """ + Returns a representation of the S3 endpoint set on this configuration. + + * "" if this is not an S3 configuration + * "default" if no explicit endpoint is set + * otherwise the URL is returned + """ + if self.is_s3(): + if self.endpoint: + return self.endpoint + return "default" + return "" + + def __str__(self): + if self.is_s3(): + return self.s3_string() + return self.fs_string() + + def __repr__(self): + return ( + f"Config<{self.__str__()}, {self.selection}, {self.mode}, {self.overwrite}>" + ) + + def check_or_delete_path(self): + # If this is local, then delete. + if self.bucket: + raise Exception(f"bucket set ({self.bucket}). Refusing to delete.") + + if self.path.exists(): + # TODO: This should really be an option on zarr-python + # as with tensorstore. + if self.overwrite: + if self.path.is_file(): + self.path.unlink() + else: + shutil.rmtree(self.path) + else: + raise Exception( + f"{self.path} exists. Use --output-overwrite to overwrite" + ) + + def open_group(self): + # Needs zarr_format=2 or we get ValueError("store mode does not support writing") + self.zr_group = zarr.open_group(store=self.zr_store, zarr_format=2) + self.zr_attrs = self.zr_group.attrs + + def create_group(self): + self.zr_group = zarr.Group.create(self.zr_store) + self.zr_attrs = self.zr_group.attrs + + def sub_config(self, subpath: str, create_or_open_group: bool = True): + sub = Config( + self.ns, + self.selection, + self.mode, + subpath if not self.subpath else self.subpath / subpath, + ) + if create_or_open_group: + if sub.selection == "input": + sub.open_group() + elif sub.selection == "output": + sub.create_group() + else: + raise Exception(f"unknown selection: {self.selection}") + return sub + + def ts_read(self): + return ts.open(self.ts_config).result() + + def zr_write_text(self, path: Path, text: str): + text = TextBuffer(text) + filename = self.subpath / path if self.subpath else path + sync(self.zr_store.set(str(filename), text)) + + def zr_read_text(self, path: str | Path): + return sync( + self.zr_store.get(str(path), prototype=BufferPrototype(TextBuffer, None)) + ) diff --git a/tests/test_resave.py b/tests/test_resave.py index 12123d3..98f8d98 100644 --- a/tests/test_resave.py +++ b/tests/test_resave.py @@ -2,7 +2,7 @@ import pytest -from ome2024_ngff_challenge import resave +from ome2024_ngff_challenge import dispatch # # Helpers @@ -34,8 +34,10 @@ def all_files(path): def test_bad_chunks(tmp_path): with pytest.raises(SystemExit): - resave.cli( + dispatch( [ + "resave", + "--cc-by", str(tmp_path / "in.zarr"), str(tmp_path / "out.zarr"), "--output-chunks=xxx", @@ -45,8 +47,10 @@ def test_bad_chunks(tmp_path): def test_conflicting_args(tmp_path): with pytest.raises(SystemExit): - resave.cli( + dispatch( [ + "resave", + "--cc-by", str(tmp_path / "in.zarr"), str(tmp_path / "out.zarr"), "--output-chunks=xxx", @@ -62,8 +66,10 @@ def test_conflicting_args(tmp_path): def test_rocrate_name(tmp_path): assert ( - resave.cli( + dispatch( [ + "resave", + "--cc-by", "--rocrate-skip", "data/2d.zarr", str(tmp_path / "out.zarr"), @@ -77,8 +83,10 @@ def test_rocrate_name(tmp_path): def test_rocrate_set_name(tmp_path): assert ( - resave.cli( + dispatch( [ + "resave", + "--cc-by", "--rocrate-name=XXX", "data/2d.zarr", str(tmp_path / "out.zarr"), @@ -95,8 +103,10 @@ def test_rocrate_full_example(tmp_path): organism = "NCBI:txid7227" modality = "obo:FBbi_00000243" assert ( - resave.cli( + dispatch( [ + "resave", + "--cc-by", "--rocrate-name=test name", 
"--rocrate-description=this should be a full description", f"--rocrate-organism={organism}", @@ -127,8 +137,10 @@ def test_rocrate_full_example(tmp_path): @pytest.mark.skip(reason="too slow") def test_remote_hcs_with_scripts(tmp_path): - resave.cli( + dispatch( [ + "resave", + "--cc-by", *IDR_BUCKET, IDR_PLATE, str(tmp_path / "out.zarr"), @@ -141,8 +153,10 @@ def test_remote_simple_with_download(tmp_path): # The labels for `6001240.zarr` have chunks like [1,59,69,136] which is # not compatible with default shard (whole image, [1,236,275,271]), # so we need to specify both: - resave.cli( + dispatch( [ + "resave", + "--cc-by", *IDR_BUCKET, IDR_3D, str(tmp_path / "out.zarr"), @@ -176,8 +190,10 @@ def check_bf2raw(tmp_path, input, expected, args): ) def test_local_tests(tmp_path, input, expected, args, func): assert ( - resave.cli( + dispatch( [ + "resave", + "--cc-by", *args, f"data/{input}.zarr", str(tmp_path / "out.zarr"),