# Download Kaggle datasets (#1487)

### Summary


CVS-137601

Adds the downloading logic for Kaggle datasets, with 258 pre-defined configurations implemented.

The Kaggle API has a request limit, so fetching the metadata for each dataset remains unresolved.
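
For reviewers, here is a minimal sketch of how the new downloader is meant to be driven programmatically, assuming `KaggleDatasetDownloader` implements the `IDatasetDownloader` interface exactly as declared in this PR (the dataset id and argument values below are hypothetical placeholders, not a real configuration):

```python
# Sketch only: exercises the classmethod interface introduced in this PR.
# The dataset id and option values are hypothetical placeholders.
from datumaro.cli.commands.downloaders import KaggleDatasetDownloader

KaggleDatasetDownloader.download(
    dataset_id="some/kaggle-dataset",  # hypothetical Kaggle dataset id
    dst_dir="kaggle-out",              # destination directory
    overwrite=False,                   # refuse to clobber non-empty output
    output_format="coco",              # target export format (illustrative)
    subset=None,                       # optional subset name
    extra_args=[],                     # forwarded to the exporter
)
```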

### Checklist
- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [ ] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Ilya Trushkin <[email protected]>
itrushkin authored May 7, 2024
1 parent 49c594d commit 6e89b4f
Showing 11 changed files with 1,837 additions and 202 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -3,6 +3,7 @@ include requirements-core.txt
include requirements-default.txt
include 3rd-party.txt
include src/datumaro/plugins/specs.json
include src/datumaro/cli/commands/downloaders/kaggle_formats.json

include rust/Cargo.toml
recursive-include rust/src *
239 changes: 42 additions & 197 deletions src/datumaro/cli/commands/download.py
@@ -3,78 +3,45 @@
# SPDX-License-Identifier: MIT

import argparse
import contextlib
import logging as log
import os
import os.path as osp
import sys
from typing import Dict

from datumaro.components.environment import DEFAULT_ENVIRONMENT
from datumaro.components.extractor_tfds import (
AVAILABLE_TFDS_DATASETS,
TFDS_EXTRACTOR_AVAILABLE,
TfdsDatasetRemoteMetadata,
)
from datumaro.util import dump_json
from datumaro.util.os_util import make_file_name

from ..util import MultilineFormatter
from ..util.errors import CliException
from ..util.project import generate_next_file_name
from .downloaders import IDatasetDownloader, KaggleDatasetDownloader, TfdsDatasetDownloader


def build_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(
help="Download a publicly available dataset",
description="""
description=f"""
Downloads a publicly available dataset and saves it in a given format.|n
|n
Currently, the only source of datasets is the TensorFlow Datasets
library. Therefore, to use this command you must install TensorFlow &
TFDS, which you can do as follows:|n
|n
|s|spip install datumaro[tf,tfds]|n
|n
To download the dataset, run "datum download run". On the other hand,
for information about the datasets, run "datum download describe".
To download the dataset, run "datum download <dataset_type> get". On the other hand,
for information about the datasets, run "datum download <dataset_type> describe".
Supported dataset types are: {list(DOWNLOADERS.keys())}
""",
formatter_class=MultilineFormatter,
)
subparsers = parser.add_subparsers(title="Commands")

build_get_subparser(subparsers)
build_describe_subparser(subparsers)

subparsers = parser.add_subparsers(title="Dataset types")
for name, downloader in DOWNLOADERS.items():
dataset_type_parser = subparsers.add_parser(
name=name,
help=f"Download {name} dataset",
formatter_class=MultilineFormatter,
)
_subparsers = dataset_type_parser.add_subparsers(title="Commands")
build_get_subparser(_subparsers, name, downloader)
build_describe_subparser(_subparsers, name, downloader)

def build_get_subparser(subparsers: argparse._SubParsersAction):
builtin_writers = sorted(DEFAULT_ENVIRONMENT.exporters)
if TFDS_EXTRACTOR_AVAILABLE:
available_datasets = ", ".join(f"tfds:{name}" for name in AVAILABLE_TFDS_DATASETS)
else:
available_datasets = "N/A (TensorFlow and/or TensorFlow Datasets " "are not installed)"

def build_get_subparser(
subparsers: argparse._SubParsersAction, name: str, downloader: IDatasetDownloader
):
parser = subparsers.add_parser(
name="get",
help="Download a publicly available dataset",
description="""
Supported datasets: {}|n
|n
Supported output formats: {}|n
|n
Examples:|n
- Download the MNIST dataset:|n
|s|s%(prog)s -i tfds:mnist -- --save-media|n
|n
- Download the VOC 2012 dataset, saving only the annotations in the COCO
format into a specific directory:|n
|s|s%(prog)s -i tfds:voc/2012 -f coco -o path/I/like/
""".format(
available_datasets, ", ".join(builtin_writers)
),
description=downloader.get_command_description(),
formatter_class=MultilineFormatter,
)

parser.add_argument("-i", "--dataset-id", required=True, help="Which dataset to download")
parser.add_argument(
"-f", "--output-format", help="Output format (default: original format of the dataset)"
@@ -96,22 +63,22 @@ def build_get_subparser(subparsers: argparse._SubParsersAction):
"Must be specified after the main command arguments",
)

parser.set_defaults(command=download_command)
parser.set_defaults(command=download_command, downloader=downloader)

return parser


def build_describe_subparser(subparsers: argparse._SubParsersAction):
def build_describe_subparser(
subparsers: argparse._SubParsersAction, name: str, downloader: IDatasetDownloader
):
parser = subparsers.add_parser(
name="describe",
help="Print information about downloadable datasets",
description="""
description=f"""
Reports information about datasets that can be downloaded with the
"datum download" command. The information is reported either as
human-readable text (the default) or as a JSON object. More detailed
information can be found in the TFDS Catalog:
<https://www.tensorflow.org/datasets/catalog/overview>.
""",
"datum download {name}" command. The information is reported either as
human-readable text (the default) or as a JSON object."""
+ downloader.describe_command_description(),
formatter_class=MultilineFormatter,
)

@@ -124,7 +91,7 @@ def build_describe_subparser(subparsers: argparse._SubParsersAction):
parser.add_argument(
"--report-file", help="File to which to write the report (default: standard output)"
)
parser.set_defaults(command=describe_downloads_command)
parser.set_defaults(command=describe_downloads_command, downloader=downloader)

return parser

@@ -136,144 +103,22 @@ def get_sensitive_args():
}


def download_command(args):
env = DEFAULT_ENVIRONMENT

if args.dataset_id.startswith("tfds:"):
if TFDS_EXTRACTOR_AVAILABLE:
tfds_ds_name = args.dataset_id[5:]
tfds_ds = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)
if tfds_ds:
default_output_format = tfds_ds.metadata.default_output_format
extractor_factory = tfds_ds.make_extractor
else:
raise CliException(f"Unsupported TFDS dataset '{tfds_ds_name}'")
else:
raise CliException(
"TFDS datasets are not available, because TFDS and/or "
"TensorFlow are not installed.\n"
"You can install them with: pip install datumaro[tf,tfds]"
)
else:
raise CliException(f"Unknown dataset ID '{args.dataset_id}'")

output_format = args.output_format or default_output_format

try:
exporter = env.exporters[output_format]
except KeyError:
raise CliException("Exporter for format '%s' is not found" % output_format)
extra_args = exporter.parse_cmdline(args.extra_args)

dst_dir = args.dst_dir
if dst_dir:
if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
raise CliException(
"Directory '%s' already exists " "(pass --overwrite to overwrite)" % dst_dir
)
else:
dst_dir = generate_next_file_name(
"%s-%s"
% (
make_file_name(args.dataset_id),
make_file_name(output_format),
)
)
dst_dir = osp.abspath(dst_dir)
DOWNLOADERS: Dict[str, IDatasetDownloader] = {
"tfds": TfdsDatasetDownloader,
"kaggle": KaggleDatasetDownloader,
}

log.info("Downloading the dataset")
extractor = extractor_factory()

if args.subset:
try:
extractor = extractor.subsets()[args.subset]
except KeyError:
raise CliException("Subset '%s' is not present in the dataset" % args.subset)

log.info("Exporting the dataset")
exporter.convert(extractor, dst_dir, default_image_ext=".png", **extra_args)

log.info("Dataset exported to '%s' as '%s'" % (dst_dir, output_format))
def download_command(args):
args.downloader.download(
args.dataset_id,
args.dst_dir,
args.overwrite,
args.output_format,
args.subset,
args.extra_args,
)


def describe_downloads_command(args):
dataset_metas: Dict[str, TfdsDatasetRemoteMetadata] = {}

if TFDS_EXTRACTOR_AVAILABLE:
for dataset_name, dataset in AVAILABLE_TFDS_DATASETS.items():
dataset_metas[f"tfds:{dataset_name}"] = dataset.query_remote_metadata()

if args.report_format == "text":
with open(
args.report_file, "w"
) if args.report_file else contextlib.nullcontext() as report_file:
if dataset_metas:
print("Available datasets:", file=report_file)

for name, meta in sorted(dataset_metas.items()):
print(file=report_file)
print(f"{name} ({meta.human_name}):", file=report_file)
print(
f" default output format: {meta.default_output_format}",
file=report_file,
)

print(" description:", file=report_file)
for line in meta.description.rstrip("\n").split("\n"):
print(f" {line}", file=report_file)

print(f" download size: {meta.download_size} bytes", file=report_file)
print(f" home URL: {meta.home_url or 'N/A'}", file=report_file)
print(f" number of classes: {meta.num_classes}", file=report_file)
print(" subsets:", file=report_file)
for subset_name, subset_meta in sorted(meta.subsets.items()):
print(f" {subset_name}: {subset_meta.num_items} items", file=report_file)
print(f" version: {meta.version}", file=report_file)
else:
print("No datasets available.", file=report_file)
print(file=report_file)
print(
"You can enable TFDS datasets by installing "
"TensorFlow and TensorFlow Datasets:",
file=report_file,
)
print(" pip install datumaro[tf,tfds]", file=report_file)

elif args.report_format == "json":

def meta_to_raw(meta: TfdsDatasetRemoteMetadata):
raw = {}

# We omit the media type from the output, because there is currently no mechanism
# for mapping media types to strings. The media type could be useful information
# for users, though, so we might want to implement such a mechanism eventually.

for attribute in (
"default_output_format",
"description",
"download_size",
"home_url",
"human_name",
"num_classes",
"version",
):
raw[attribute] = getattr(meta, attribute)

raw["subsets"] = {
name: {"num_items": subset.num_items} for name, subset in meta.subsets.items()
}

return raw

with (
open(args.report_file, "w") if args.report_file else contextlib.nullcontext(sys.stdout)
) as report_file:
report_file.write(
dump_json(
{name: meta_to_raw(meta) for name, meta in dataset_metas.items()},
indent=True,
append_newline=True,
).decode()
)
else:
assert False, "unreachable code"
return args.downloader.describe(args.report_format, args.report_file)
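
The refactor hinges on argparse's `set_defaults`: each dataset-type subparser attaches its downloader class to the parsed namespace, so the shared `download_command` and `describe_downloads_command` handlers never branch on the dataset type. A self-contained sketch of that pattern, with illustrative stand-in classes rather than the PR's real downloaders:

```python
import argparse

# Illustrative stand-ins for TfdsDatasetDownloader / KaggleDatasetDownloader.
class FakeTfds:
    @classmethod
    def describe(cls):
        return "tfds datasets"

class FakeKaggle:
    @classmethod
    def describe(cls):
        return "kaggle datasets"

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(title="Dataset types")
for name, downloader in {"tfds": FakeTfds, "kaggle": FakeKaggle}.items():
    sub = subparsers.add_parser(name)
    # Store the class on the namespace; the handler stays generic.
    sub.set_defaults(downloader=downloader)

args = parser.parse_args(["kaggle"])
print(args.downloader.describe())  # -> "kaggle datasets"
```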
9 changes: 9 additions & 0 deletions src/datumaro/cli/commands/downloaders/__init__.py
@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

from .downloader import IDatasetDownloader
from .kaggle import KaggleDatasetDownloader
from .tfds import TfdsDatasetDownloader

__all__ = ["IDatasetDownloader", "KaggleDatasetDownloader", "TfdsDatasetDownloader"]
31 changes: 31 additions & 0 deletions src/datumaro/cli/commands/downloaders/downloader.py
@@ -0,0 +1,31 @@
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

from typing import Any


class IDatasetDownloader:
@classmethod
def download(
cls,
dataset_id: str,
dst_dir: str,
overwrite: bool,
output_format: str,
subset: str,
extra_args: Any,
):
raise NotImplementedError()

@classmethod
def describe(cls, report_format, report_file=None) -> str:
raise NotImplementedError()

@classmethod
def get_command_description(cls, *args, **kwargs) -> str:
raise NotImplementedError()

@classmethod
def describe_command_description(cls):
raise NotImplementedError()
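
Since `IDatasetDownloader` is a plain class of classmethods rather than an ABC, adding a new dataset source only means overriding the four hooks and registering the class in `DOWNLOADERS` in `download.py`. A hypothetical subclass might look like this (sketch only; `MyHubDatasetDownloader` and the URL are invented for illustration, not part of this PR):

```python
# Hypothetical example of plugging a new source into the downloader registry.
from datumaro.cli.commands.downloaders import IDatasetDownloader

class MyHubDatasetDownloader(IDatasetDownloader):
    @classmethod
    def download(cls, dataset_id, dst_dir, overwrite, output_format, subset, extra_args):
        # Fetch dataset_id from the custom hub and export it to dst_dir.
        ...

    @classmethod
    def describe(cls, report_format, report_file=None):
        # Write a text or JSON report of the available datasets.
        ...

    @classmethod
    def get_command_description(cls):
        return "Download datasets from MyHub."

    @classmethod
    def describe_command_description(cls):
        return " See https://example.com for the catalog."
```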
(The remaining 7 changed files are not shown.)
