# Download Kaggle datasets (#1487)

### Summary


CVS-137601

Adds the downloading logic for Kaggle datasets, with 258 pre-defined configurations implemented.

The Kaggle API has a request limit, so fetching the metadata for each dataset remains unresolved.
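
For reviewers, here is a minimal sketch of how the new downloader is meant to be driven programmatically, assuming `KaggleDatasetDownloader` implements the `IDatasetDownloader` interface exactly as declared in this PR (the dataset id and argument values below are hypothetical placeholders, not a real configuration):

```python
# Sketch only: exercises the classmethod interface introduced in this PR.
# The dataset id and option values are hypothetical placeholders.
from datumaro.cli.commands.downloaders import KaggleDatasetDownloader

KaggleDatasetDownloader.download(
    dataset_id="some/kaggle-dataset",  # hypothetical Kaggle dataset id
    dst_dir="kaggle-out",              # destination directory
    overwrite=False,                   # refuse to clobber non-empty output
    output_format="coco",              # target export format (illustrative)
    subset=None,                       # optional subset name
    extra_args=[],                     # forwarded to the exporter
)
```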

### Checklist
- [x] I have added unit tests to cover my changes.
- [ ] I have added integration tests to cover my changes.
- [ ] I have added the description of my changes into
[CHANGELOG](https://github.com/openvinotoolkit/datumaro/blob/develop/CHANGELOG.md).
- [ ] I have updated the
[documentation](https://github.com/openvinotoolkit/datumaro/tree/develop/docs)
accordingly.

### License

- [x] I submit _my code changes_ under the same [MIT
License](https://github.com/openvinotoolkit/datumaro/blob/develop/LICENSE)
that covers the project.
  Feel free to contact the maintainers if that's a concern.
- [x] I have updated the license header for each file (see an example
below).

```python
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT
```

---------

Signed-off-by: Ilya Trushkin <[email protected]>
itrushkin authored May 7, 2024
1 parent 49c594d commit 6e89b4f
Showing 11 changed files with 1,837 additions and 202 deletions.
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -3,6 +3,7 @@ include requirements-core.txt
include requirements-default.txt
include 3rd-party.txt
include src/datumaro/plugins/specs.json
include src/datumaro/cli/commands/downloaders/kaggle_formats.json

include rust/Cargo.toml
recursive-include rust/src *
239 changes: 42 additions & 197 deletions src/datumaro/cli/commands/download.py
@@ -3,78 +3,45 @@
# SPDX-License-Identifier: MIT

import argparse
import contextlib
import logging as log
import os
import os.path as osp
import sys
from typing import Dict

from datumaro.components.environment import DEFAULT_ENVIRONMENT
from datumaro.components.extractor_tfds import (
AVAILABLE_TFDS_DATASETS,
TFDS_EXTRACTOR_AVAILABLE,
TfdsDatasetRemoteMetadata,
)
from datumaro.util import dump_json
from datumaro.util.os_util import make_file_name

from ..util import MultilineFormatter
from ..util.errors import CliException
from ..util.project import generate_next_file_name
from .downloaders import IDatasetDownloader, KaggleDatasetDownloader, TfdsDatasetDownloader


def build_parser(parser_ctor=argparse.ArgumentParser):
parser = parser_ctor(
help="Download a publicly available dataset",
description="""
description=f"""
Downloads a publicly available dataset and saves it in a given format.|n
|n
Currently, the only source of datasets is the TensorFlow Datasets
library. Therefore, to use this command you must install TensorFlow &
TFDS, which you can do as follows:|n
|n
|s|spip install datumaro[tf,tfds]|n
|n
To download the dataset, run "datum download run". On the other hand,
for information about the datasets, run "datum download describe".
To download the dataset, run "datum download <dataset_type> get". On the other hand,
for information about the datasets, run "datum download <dataset_type> describe".
Supported dataset types are: {list(DOWNLOADERS.keys())}
""",
formatter_class=MultilineFormatter,
)
subparsers = parser.add_subparsers(title="Commands")

build_get_subparser(subparsers)
build_describe_subparser(subparsers)

subparsers = parser.add_subparsers(title="Dataset types")
for name, downloader in DOWNLOADERS.items():
dataset_type_parser = subparsers.add_parser(
name=name,
help=f"Download {name} dataset",
formatter_class=MultilineFormatter,
)
_subparsers = dataset_type_parser.add_subparsers(title="Commands")
build_get_subparser(_subparsers, name, downloader)
build_describe_subparser(_subparsers, name, downloader)

def build_get_subparser(subparsers: argparse._SubParsersAction):
builtin_writers = sorted(DEFAULT_ENVIRONMENT.exporters)
if TFDS_EXTRACTOR_AVAILABLE:
available_datasets = ", ".join(f"tfds:{name}" for name in AVAILABLE_TFDS_DATASETS)
else:
available_datasets = "N/A (TensorFlow and/or TensorFlow Datasets " "are not installed)"

def build_get_subparser(
subparsers: argparse._SubParsersAction, name: str, downloader: IDatasetDownloader
):
parser = subparsers.add_parser(
name="get",
help="Download a publicly available dataset",
description="""
Supported datasets: {}|n
|n
Supported output formats: {}|n
|n
Examples:|n
- Download the MNIST dataset:|n
|s|s%(prog)s -i tfds:mnist -- --save-media|n
|n
- Download the VOC 2012 dataset, saving only the annotations in the COCO
format into a specific directory:|n
|s|s%(prog)s -i tfds:voc/2012 -f coco -o path/I/like/
""".format(
available_datasets, ", ".join(builtin_writers)
),
description=downloader.get_command_description(),
formatter_class=MultilineFormatter,
)

parser.add_argument("-i", "--dataset-id", required=True, help="Which dataset to download")
parser.add_argument(
"-f", "--output-format", help="Output format (default: original format of the dataset)"
@@ -96,22 +63,22 @@ def build_get_subparser(subparsers: argparse._SubParsersAction):
"Must be specified after the main command arguments",
)

parser.set_defaults(command=download_command)
parser.set_defaults(command=download_command, downloader=downloader)

return parser


def build_describe_subparser(subparsers: argparse._SubParsersAction):
def build_describe_subparser(
subparsers: argparse._SubParsersAction, name: str, downloader: IDatasetDownloader
):
parser = subparsers.add_parser(
name="describe",
help="Print information about downloadable datasets",
description="""
description=f"""
Reports information about datasets that can be downloaded with the
"datum download" command. The information is reported either as
human-readable text (the default) or as a JSON object. More detailed
information can be found in the TFDS Catalog:
<https://www.tensorflow.org/datasets/catalog/overview>.
""",
"datum download {name}" command. The information is reported either as
human-readable text (the default) or as a JSON object."""
+ downloader.describe_command_description(),
formatter_class=MultilineFormatter,
)

@@ -124,7 +91,7 @@ def build_describe_subparser(subparsers: argparse._SubParsersAction):
parser.add_argument(
"--report-file", help="File to which to write the report (default: standard output)"
)
parser.set_defaults(command=describe_downloads_command)
parser.set_defaults(command=describe_downloads_command, downloader=downloader)

return parser

@@ -136,144 +103,22 @@ def get_sensitive_args():
}


def download_command(args):
env = DEFAULT_ENVIRONMENT

if args.dataset_id.startswith("tfds:"):
if TFDS_EXTRACTOR_AVAILABLE:
tfds_ds_name = args.dataset_id[5:]
tfds_ds = AVAILABLE_TFDS_DATASETS.get(tfds_ds_name)
if tfds_ds:
default_output_format = tfds_ds.metadata.default_output_format
extractor_factory = tfds_ds.make_extractor
else:
raise CliException(f"Unsupported TFDS dataset '{tfds_ds_name}'")
else:
raise CliException(
"TFDS datasets are not available, because TFDS and/or "
"TensorFlow are not installed.\n"
"You can install them with: pip install datumaro[tf,tfds]"
)
else:
raise CliException(f"Unknown dataset ID '{args.dataset_id}'")

output_format = args.output_format or default_output_format

try:
exporter = env.exporters[output_format]
except KeyError:
raise CliException("Exporter for format '%s' is not found" % output_format)
extra_args = exporter.parse_cmdline(args.extra_args)

dst_dir = args.dst_dir
if dst_dir:
if not args.overwrite and osp.isdir(dst_dir) and os.listdir(dst_dir):
raise CliException(
"Directory '%s' already exists " "(pass --overwrite to overwrite)" % dst_dir
)
else:
dst_dir = generate_next_file_name(
"%s-%s"
% (
make_file_name(args.dataset_id),
make_file_name(output_format),
)
)
dst_dir = osp.abspath(dst_dir)
DOWNLOADERS: Dict[str, IDatasetDownloader] = {
"tfds": TfdsDatasetDownloader,
"kaggle": KaggleDatasetDownloader,
}

log.info("Downloading the dataset")
extractor = extractor_factory()

if args.subset:
try:
extractor = extractor.subsets()[args.subset]
except KeyError:
raise CliException("Subset '%s' is not present in the dataset" % args.subset)

log.info("Exporting the dataset")
exporter.convert(extractor, dst_dir, default_image_ext=".png", **extra_args)

log.info("Dataset exported to '%s' as '%s'" % (dst_dir, output_format))
def download_command(args):
args.downloader.download(
args.dataset_id,
args.dst_dir,
args.overwrite,
args.output_format,
args.subset,
args.extra_args,
)


def describe_downloads_command(args):
dataset_metas: Dict[str, TfdsDatasetRemoteMetadata] = {}

if TFDS_EXTRACTOR_AVAILABLE:
for dataset_name, dataset in AVAILABLE_TFDS_DATASETS.items():
dataset_metas[f"tfds:{dataset_name}"] = dataset.query_remote_metadata()

if args.report_format == "text":
with open(
args.report_file, "w"
) if args.report_file else contextlib.nullcontext() as report_file:
if dataset_metas:
print("Available datasets:", file=report_file)

for name, meta in sorted(dataset_metas.items()):
print(file=report_file)
print(f"{name} ({meta.human_name}):", file=report_file)
print(
f" default output format: {meta.default_output_format}",
file=report_file,
)

print(" description:", file=report_file)
for line in meta.description.rstrip("\n").split("\n"):
print(f" {line}", file=report_file)

print(f" download size: {meta.download_size} bytes", file=report_file)
print(f" home URL: {meta.home_url or 'N/A'}", file=report_file)
print(f" number of classes: {meta.num_classes}", file=report_file)
print(" subsets:", file=report_file)
for subset_name, subset_meta in sorted(meta.subsets.items()):
print(f" {subset_name}: {subset_meta.num_items} items", file=report_file)
print(f" version: {meta.version}", file=report_file)
else:
print("No datasets available.", file=report_file)
print(file=report_file)
print(
"You can enable TFDS datasets by installing "
"TensorFlow and TensorFlow Datasets:",
file=report_file,
)
print(" pip install datumaro[tf,tfds]", file=report_file)

elif args.report_format == "json":

def meta_to_raw(meta: TfdsDatasetRemoteMetadata):
raw = {}

# We omit the media type from the output, because there is currently no mechanism
# for mapping media types to strings. The media type could be useful information
# for users, though, so we might want to implement such a mechanism eventually.

for attribute in (
"default_output_format",
"description",
"download_size",
"home_url",
"human_name",
"num_classes",
"version",
):
raw[attribute] = getattr(meta, attribute)

raw["subsets"] = {
name: {"num_items": subset.num_items} for name, subset in meta.subsets.items()
}

return raw

with (
open(args.report_file, "w") if args.report_file else contextlib.nullcontext(sys.stdout)
) as report_file:
report_file.write(
dump_json(
{name: meta_to_raw(meta) for name, meta in dataset_metas.items()},
indent=True,
append_newline=True,
).decode()
)
else:
assert False, "unreachable code"
return args.downloader.describe(args.report_format, args.report_file)
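
The refactor hinges on argparse's `set_defaults`: each dataset-type subparser attaches its downloader class to the parsed namespace, so the shared `download_command` and `describe_downloads_command` handlers never branch on the dataset type. A self-contained sketch of that pattern, with illustrative stand-in classes rather than the PR's real downloaders:

```python
import argparse

# Illustrative stand-ins for TfdsDatasetDownloader / KaggleDatasetDownloader.
class FakeTfds:
    @classmethod
    def describe(cls):
        return "tfds datasets"

class FakeKaggle:
    @classmethod
    def describe(cls):
        return "kaggle datasets"

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(title="Dataset types")
for name, downloader in {"tfds": FakeTfds, "kaggle": FakeKaggle}.items():
    sub = subparsers.add_parser(name)
    # Store the class on the namespace; the handler stays generic.
    sub.set_defaults(downloader=downloader)

args = parser.parse_args(["kaggle"])
print(args.downloader.describe())  # -> "kaggle datasets"
```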
9 changes: 9 additions & 0 deletions src/datumaro/cli/commands/downloaders/__init__.py
@@ -0,0 +1,9 @@
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

from .downloader import IDatasetDownloader
from .kaggle import KaggleDatasetDownloader
from .tfds import TfdsDatasetDownloader

__all__ = ["IDatasetDownloader", "KaggleDatasetDownloader", "TfdsDatasetDownloader"]
31 changes: 31 additions & 0 deletions src/datumaro/cli/commands/downloaders/downloader.py
@@ -0,0 +1,31 @@
# Copyright (C) 2024 Intel Corporation
#
# SPDX-License-Identifier: MIT

from typing import Any


class IDatasetDownloader:
@classmethod
def download(
cls,
dataset_id: str,
dst_dir: str,
overwrite: bool,
output_format: str,
subset: str,
extra_args: Any,
):
raise NotImplementedError()

@classmethod
def describe(cls, report_format, report_file=None) -> str:
raise NotImplementedError()

@classmethod
def get_command_description(cls, *args, **kwargs) -> str:
raise NotImplementedError()

@classmethod
def describe_command_description(cls):
raise NotImplementedError()
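
Since `IDatasetDownloader` is a plain class of classmethods rather than an ABC, adding a new dataset source only means overriding the four hooks and registering the class in `DOWNLOADERS` in `download.py`. A hypothetical subclass might look like this (sketch only; `MyHubDatasetDownloader` and the URL are invented for illustration, not part of this PR):

```python
# Hypothetical example of plugging a new source into the downloader registry.
from datumaro.cli.commands.downloaders import IDatasetDownloader

class MyHubDatasetDownloader(IDatasetDownloader):
    @classmethod
    def download(cls, dataset_id, dst_dir, overwrite, output_format, subset, extra_args):
        # Fetch dataset_id from the custom hub and export it to dst_dir.
        ...

    @classmethod
    def describe(cls, report_format, report_file=None):
        # Write a text or JSON report of the available datasets.
        ...

    @classmethod
    def get_command_description(cls):
        return "Download datasets from MyHub."

    @classmethod
    def describe_command_description(cls):
        return " See https://example.com for the catalog."
```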
(The remaining 7 changed files are not shown.)
