Skip to content

Commit

Permalink
[CodeCamp2023-341] 多模态数据集文档补充-COCO Retrieval
Browse files Browse the repository at this point in the history
  • Loading branch information
ASHORE1225 authored Oct 8, 2023
1 parent 06bb586 commit 3bcf7e2
Showing 1 changed file with 73 additions and 2 deletions.
75 changes: 73 additions & 2 deletions mmpretrain/datasets/coco_retrieval.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,45 @@
# Copyright (c) OpenMMLab. All rights reserved.
import json
import os.path as osp
from collections import OrderedDict
from typing import List
from os import PathLike
from typing import List, Sequence, Union

from mmengine import get_file_backend

from mmpretrain.registry import DATASETS
from mmpretrain.registry import DATASETS, TRANSFORMS
from .base_dataset import BaseDataset


def expanduser(data_prefix):
    """Expand a leading ``~`` in a path-like value.

    Args:
        data_prefix: A ``str`` or ``os.PathLike`` path, or any other
            object (for example a dict of prefixes or ``None``).

    Returns:
        The result of ``os.path.expanduser`` when ``data_prefix`` is a
        ``str`` or ``os.PathLike``; otherwise ``data_prefix`` itself,
        returned unchanged.
    """
    # Anything that is not a path is passed through untouched so callers
    # can hand in dict-style prefixes without special-casing them.
    if not isinstance(data_prefix, (str, PathLike)):
        return data_prefix
    return osp.expanduser(data_prefix)


@DATASETS.register_module()
class COCORetrieval(BaseDataset):
"""COCO Retrieval dataset.
COCO (Common Objects in Context): The COCO dataset contains more than
330K images, each of which has approximately 5 descriptive annotations.
This dataset was released in collaboration between Microsoft and Carnegie
Mellon University.
COCO_2014 dataset directory: ::
COCO_2014
├── val2014
├── train2014
├── annotations
├── instances_train2014.json
├── instances_val2014.json
├── person_keypoints_train2014.json
├── person_keypoints_val2014.json
├── captions_train2014.json
├── captions_val2014.json
Args:
ann_file (str): Annotation file path.
test_mode (bool): Whether dataset is used for evaluation. This will
Expand All @@ -23,8 +50,52 @@ class COCORetrieval(BaseDataset):
data_prefix (str | dict): Prefix for training data. Defaults to ''.
pipeline (Sequence): Processing pipeline. Defaults to an empty tuple.
**kwargs: Other keyword arguments in :class:`BaseDataset`.
Examples:
>>> from mmpretrain.datasets import COCORetrieval
>>> train_dataset = COCORetrieval(
...     data_root='coco2014/',
...     ann_file='annotations/captions_train2014.json')
>>> train_dataset
Dataset COCORetrieval
Number of samples: 414113
Annotation file: /coco2014/annotations/captions_train2014.json
Prefix of images: /coco2014/
>>> from mmpretrain.datasets import COCORetrieval
>>> val_dataset = COCORetrieval(
...     data_root='coco2014/',
...     ann_file='annotations/captions_val2014.json')
>>> val_dataset
Dataset COCORetrieval
Number of samples: 202654
Annotation file: /coco2014/annotations/captions_val2014.json
Prefix of images: /coco2014/
"""

def __init__(self,
             ann_file: str,
             test_mode: bool = False,
             data_prefix: Union[str, dict] = '',
             data_root: str = '',
             pipeline: Sequence = (),
             **kwargs):
    """Normalise the path arguments, build the pipeline and initialise.

    Args:
        ann_file (str): Annotation file path; a leading ``~`` is expanded.
        test_mode (bool): Forwarded unchanged to the parent initialiser.
            Defaults to False.
        data_prefix (str | dict): Prefix for the data. A string is
            user-expanded and wrapped as ``dict(img_path=...)``; a dict
            is forwarded as given. Defaults to ''.
        data_root (str): Forwarded unchanged to the parent initialiser.
            Defaults to ''.
        pipeline (Sequence): Processing steps. Dict entries are built
            through the ``TRANSFORMS`` registry; other entries are kept
            as they are. Defaults to an empty tuple.
        **kwargs: Other keyword arguments passed to the parent
            initialiser.
    """
    # A bare string prefix is stored under the ``img_path`` key.
    if isinstance(data_prefix, str):
        data_prefix = {'img_path': expanduser(data_prefix)}

    expanded_ann_file = expanduser(ann_file)

    # Config dicts become transform objects; ready-made entries are
    # passed through untouched, preserving the original order.
    built_pipeline = [
        TRANSFORMS.build(step) if isinstance(step, dict) else step
        for step in pipeline
    ]

    super().__init__(
        data_root=data_root,
        data_prefix=data_prefix,
        test_mode=test_mode,
        pipeline=built_pipeline,
        ann_file=expanded_ann_file,
        **kwargs,
    )

def load_data_list(self) -> List[dict]:
"""Load data list."""
# get file backend
Expand Down

0 comments on commit 3bcf7e2

Please sign in to comment.