From 52fe50de105a111d8f6f019aa26e900009fc64a6 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 8 Jul 2024 17:03:39 +0100 Subject: [PATCH 01/17] Fix validation and test split not being reproducible --- crabs/detector/datamodules.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crabs/detector/datamodules.py b/crabs/detector/datamodules.py index 1e6d18a4..30505efb 100644 --- a/crabs/detector/datamodules.py +++ b/crabs/detector/datamodules.py @@ -170,9 +170,10 @@ def _compute_splits( """ # Optionally fix the generator for a reproducible split of data - generator = None + generator_1, generator_2 = None, None if self.split_seed: - generator = torch.Generator().manual_seed(self.split_seed) + generator_1 = torch.Generator().manual_seed(self.split_seed) + generator_2 = torch.Generator().manual_seed(self.split_seed) # Create dataset (combining all datasets passed) full_dataset = CrabsCocoDetection( @@ -189,7 +190,7 @@ def _compute_splits( train_dataset, test_val_dataset = random_split( full_dataset, [self.config["train_fraction"], 1 - self.config["train_fraction"]], - generator=generator, + generator=generator_1, ) # Split test/val sets from the remainder @@ -199,6 +200,7 @@ def _compute_splits( 1 - self.config["val_over_test_fraction"], self.config["val_over_test_fraction"], ], + generator=generator_2, ) return train_dataset, test_dataset, val_dataset From 0b2c7b2711f9bd75e25ea921a78b8dd4738cc1ed Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Fri, 18 Oct 2024 20:38:40 +0200 Subject: [PATCH 02/17] Check-manifest fix --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e16d66da..5632813f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,8 +30,8 @@ repos: additional_dependencies: - types-setuptools - repo: https://github.com/mgedmin/check-manifest - rev: "0.49" + rev: "0.50" hooks: - id: check-manifest args: [--no-build-isolation] - additional_dependencies: [setuptools-scm] + additional_dependencies: [setuptools-scm, wheel] From 26f2b751d1a032bfc724cf63cab6b5f836baeb33 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Fri, 18 Oct 2024 20:38:58 +0200 Subject: [PATCH 03/17] Draft dataset split tests --- crabs/detector/datamodules.py | 2 +- tests/test_unit/test_datamodules.py | 181 +++++++++++++++++++++------- 2 files changed, 139 insertions(+), 44 deletions(-) diff --git a/crabs/detector/datamodules.py b/crabs/detector/datamodules.py index 30505efb..86e2cfec 100644 --- a/crabs/detector/datamodules.py +++ b/crabs/detector/datamodules.py @@ -218,9 +218,9 @@ def setup(self, stage: str): Define the transforms for each split of the data and compute them. """ # Assign transforms + self.train_transform = self._get_train_transform() # right now assuming validation and test get the same transforms test_and_val_transform = self._get_test_val_transform() - self.train_transform = self._get_train_transform() self.test_transform = test_and_val_transform self.val_transform = test_and_val_transform diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index c5ec4080..9e2c40dd 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -24,6 +24,39 @@ def default_train_config(): return yaml.safe_load(f) +@pytest.fixture +def crabs_data_module_realistic(default_train_config): + return CrabsDataModule( + list_img_dirs=["dir1"], + list_annotation_files=["anno1"], + config=default_train_config, + split_seed=123, + no_data_augmentation=False, + ) + + +@pytest.fixture +def crabs_data_module_with_data_augm(default_train_config): + return CrabsDataModule( + list_img_dirs=["dir1", "dir2"], + list_annotation_files=["anno1", "anno2"], + config=default_train_config, + split_seed=123, + no_data_augmentation=False, + ) + + +@pytest.fixture +def crabs_data_module_without_data_augm(default_train_config): + return CrabsDataModule( + list_img_dirs=["dir1", "dir2"], + list_annotation_files=["anno1", "anno2"], + config=default_train_config, + split_seed=123, + no_data_augmentation=True, + ) + + @pytest.fixture def expected_data_augm_transforms(): return transforms.Compose( @@ -58,28 +91,6 @@ def expected_no_data_augm_transforms(): ) -@pytest.fixture -def crabs_data_module_with_data_augm(default_train_config): - return CrabsDataModule( - list_img_dirs=["dir1", "dir2"], - list_annotation_files=["anno1", "anno2"], - config=default_train_config, - split_seed=123, - no_data_augmentation=False, - ) - - -@pytest.fixture -def crabs_data_module_without_data_augm(default_train_config): - return CrabsDataModule( - list_img_dirs=["dir1", "dir2"], - list_annotation_files=["anno1", "anno2"], - config=default_train_config, - split_seed=123, - no_data_augmentation=True, - ) - - def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): """Compare the attributes of two transforms excluding those in list.""" @@ -98,6 +109,27 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): return transform1_attrs_without_fns == transform2_attrs_without_fns +@pytest.fixture +def dummy_dataset(): + """Create dummy images and annotations for testing.""" + num_samples = 5 + images = [torch.randn(3, 256, 256) for _ in range(num_samples)] + annotations = [] + for _ in range(num_samples): + # Generate random number of bounding boxes for each image + num_boxes = random.randint(1, 5) + boxes = [] + for _ in range(num_boxes): + # Generate random bounding box coordinates within image size + x_min = random.randint(0, 200) + y_min = random.randint(0, 200) + x_max = random.randint(x_min + 10, 256) + y_max = random.randint(y_min + 10, 256) + boxes.append([x_min, y_min, x_max, y_max]) + annotations.append(torch.tensor(boxes)) + return images, annotations + + @pytest.mark.parametrize( "crabs_data_module, expected_train_transforms", [ @@ -111,6 +143,7 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): def test_get_train_transform( crabs_data_module, expected_train_transforms, request ): + """Test transforms linked to training set are as expected""" crabs_data_module = request.getfixturevalue(crabs_data_module) expected_train_transforms = request.getfixturevalue( expected_train_transforms @@ -150,6 +183,7 @@ def test_get_train_transform( def test_get_test_val_transform( crabs_data_module, expected_test_val_transforms, request ): + """Test transforms linked to test and validation sets are as expected""" crabs_data_module = request.getfixturevalue(crabs_data_module) expected_test_val_transforms = request.getfixturevalue( expected_test_val_transforms @@ -167,27 +201,6 @@ def test_get_test_val_transform( assert test_val_tr.__dict__ == expected_test_val_tr.__dict__ -@pytest.fixture -def dummy_dataset(): - """Create dummy images and annotations for testing.""" - num_samples = 5 - images = [torch.randn(3, 256, 256) for _ in range(num_samples)] - annotations = [] - for _ in range(num_samples): - # Generate random number of bounding boxes for each image - num_boxes = random.randint(1, 5) - boxes = [] - for _ in range(num_boxes): - # Generate random bounding box coordinates within image size - x_min = random.randint(0, 200) - y_min = random.randint(0, 200) - x_max = random.randint(x_min + 10, 256) - y_max = random.randint(y_min + 10, 256) - boxes.append([x_min, y_min, x_max, y_max]) - annotations.append(torch.tensor(boxes)) - return images, annotations - - @pytest.mark.parametrize( "crabs_data_module", [ @@ -210,3 +223,85 @@ def test_collate_fn(crabs_data_module, dummy_dataset, request): image, annotation = sample assert torch.equal(image, dummy_dataset[0][i]) assert torch.equal(annotation, dummy_dataset[1][i]) + + +@pytest.mark.parametrize( + "seed, expected_indices", + [ + (123, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}), + (42, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}), + ], +) +def test_compute_splits( + seed, + expected_indices, + crabs_data_module_realistic, +): + """Test dataset splits are reproducible and according to the requested fraction""" + + # Get transforms + dm = crabs_data_module_realistic + train_transform = dm._get_test_val_transform() + test_and_val_transform = dm._get_test_val_transform() + + # Compute splits + train_dataset, _, _ = dm._compute_splits(train_transform) + _, test_dataset, val_dataset = dm._compute_splits(test_and_val_transform) + + # Check split sizes are as expected + total_dataset_size = ( + len(train_dataset) + len(test_dataset) + len(val_dataset) + ) + assert total_dataset_size == 50 + # TODO: change to np.isclose + assert len(train_dataset) / total_dataset_size == 0.8 + assert len(test_dataset) / total_dataset_size == 0.1 + assert len(val_dataset) / total_dataset_size == 0.1 + + # Check splits are non-overlapping in image IDs + # --- I cannot do this because samples are tuple(image, annotation) + # assert len(set(train_dataset) & set(test_dataset)) == 0 + # assert len(set(train_dataset) & set(val_dataset)) == 0 + # assert len(set(test_dataset) & set(val_dataset)) == 0 + # assert len(set(train_dataset) & set(test_dataset)) == 0 + + # Compute lists of image IDs per dataset + image_ids_per_dataset = {} + for dataset, dataset_str in zip( + [train_dataset, test_dataset, val_dataset], ["train", "test", "val"] + ): + image_ids_per_dataset[dataset_str] = [ + sample[1]["image_id"] for sample in dataset + ] + + # Check splits are non-overlapping in image IDs + # TODO: Can I improve this? it is v slow + assert ( + len( + set(image_ids_per_dataset["train"]) + & set(image_ids_per_dataset["test"]) + ) + == 0 + ) + assert ( + len( + set(image_ids_per_dataset["train"]) + & set(image_ids_per_dataset["val"]) + ) + == 0 + ) + assert ( + len( + set(image_ids_per_dataset["test"]) + & set(image_ids_per_dataset["val"]) + ) + == 0 + ) + + # Check splits are reproducible + # we check we always get the same indices from the dataset + # we input to `random_split` given the same seed + # Note that the indices are not the same as the image IDs! + assert train_dataset.indices[:3] == expected_indices["train"] + assert test_dataset.indices[:3] == expected_indices["test"] + assert val_dataset.indices[:3] == expected_indices["val"] From 17501d3f0ed0eda9d0c1062ccd16ea0b9a581de5 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 10:36:21 +0100 Subject: [PATCH 04/17] Simplify dummy dataset fixture --- tests/test_unit/test_datamodules.py | 32 +++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 9e2c40dd..7b2e5f7e 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -111,20 +111,30 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): @pytest.fixture def dummy_dataset(): - """Create dummy images and annotations for testing.""" - num_samples = 5 - images = [torch.randn(3, 256, 256) for _ in range(num_samples)] + """Create dummy images and annotations for testing. + + The dataset consists of 5 images, with a random number of bounding boxes + per image. The bounding boxes have fixed width and height, but their location + is randomised. + """ + n_images = 5 + img_size = 256 + fixed_width_height = 10 + + images = [torch.randn(3, img_size, img_size) for _ in range(n_images)] annotations = [] - for _ in range(num_samples): + for _ in range(n_images): # Generate random number of bounding boxes for each image - num_boxes = random.randint(1, 5) + n_bboxes = random.randint(1, 5) boxes = [] - for _ in range(num_boxes): - # Generate random bounding box coordinates within image size - x_min = random.randint(0, 200) - y_min = random.randint(0, 200) - x_max = random.randint(x_min + 10, 256) - y_max = random.randint(y_min + 10, 256) + for _ in range(n_bboxes): + # Randomise the location of the top left corner of the bounding box + x_min = random.randint(0, img_size - fixed_width_height) + y_min = random.randint(0, img_size - fixed_width_height) + + # Add fixed width and height to get the bottom right corner + x_max = x_min + fixed_width_height + y_max = y_min + fixed_width_height boxes.append([x_min, y_min, x_max, y_max]) annotations.append(torch.tensor(boxes)) return images, annotations From d54ee2572a8f979298d2944d121e1be15a0ac6cc Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 10:37:16 +0100 Subject: [PATCH 05/17] Add dummy dataset directories fixture --- tests/test_unit/test_datamodules.py | 56 +++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 15 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 7b2e5f7e..e94a47a9 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -5,6 +5,7 @@ import torch import torchvision.transforms.v2 as transforms import yaml # type: ignore +from torchvision.utils import save_image from crabs.detector.datamodules import CrabsDataModule @@ -24,17 +25,6 @@ def default_train_config(): return yaml.safe_load(f) -@pytest.fixture -def crabs_data_module_realistic(default_train_config): - return CrabsDataModule( - list_img_dirs=["dir1"], - list_annotation_files=["anno1"], - config=default_train_config, - split_seed=123, - no_data_augmentation=False, - ) - - @pytest.fixture def crabs_data_module_with_data_augm(default_train_config): return CrabsDataModule( @@ -140,6 +130,33 @@ def dummy_dataset(): return images, annotations +@pytest.fixture(scope="session") +def dummy_dataset_dirs(dummy_dataset, tmp_path_factory): + """Save dummy dataset to temporary directories and return their paths.""" + + # Get dummy data + images, annotations = dummy_dataset + + # Create temporary directories + frames_path = tmp_path_factory.mktemp("frames") + annotations_path = tmp_path_factory.mktemp("annotations") + + # Save images to temporary directory + for idx, img in enumerate(images): + out_path = frames_path / f"frame_{idx:04d}.png" + save_image(img, out_path) + + # Save annotations with expected format to temporary directory + + # return as dict + dataset_dict = { + "frames": frames_path, + "annotations": annotations_path, + } + + return dataset_dict + + @pytest.mark.parametrize( "crabs_data_module, expected_train_transforms", [ @@ -245,12 +262,21 @@ def test_collate_fn(crabs_data_module, dummy_dataset, request): def test_compute_splits( seed, expected_indices, - crabs_data_module_realistic, + dummy_dataset_dirs, + default_train_config, # ---- edit config too? ): """Test dataset splits are reproducible and according to the requested fraction""" - # Get transforms - dm = crabs_data_module_realistic + # Create datamodule + dm = CrabsDataModule( + list_img_dirs=[dummy_dataset_dirs["frames"]], + list_annotation_files=[dummy_dataset_dirs["annotations"]], + config=default_train_config, + split_seed=seed, + no_data_augmentation=False, + ) + + # Add transforms train_transform = dm._get_test_val_transform() test_and_val_transform = dm._get_test_val_transform() @@ -285,7 +311,7 @@ def test_compute_splits( ] # Check splits are non-overlapping in image IDs - # TODO: Can I improve this? it is v slow + # TODO: Can I improve this? it is v slow! assert ( len( set(image_ids_per_dataset["train"]) From 85484ba18f8ef688a875c2d01384ee9e80b3ac8b Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 12:49:41 +0100 Subject: [PATCH 06/17] Add annotations file to fixture --- tests/test_unit/test_datamodules.py | 93 ++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 14 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index e94a47a9..021194b8 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -1,3 +1,4 @@ +import json import random from pathlib import Path @@ -99,13 +100,13 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): return transform1_attrs_without_fns == transform2_attrs_without_fns -@pytest.fixture +@pytest.fixture(scope="module") def dummy_dataset(): """Create dummy images and annotations for testing. The dataset consists of 5 images, with a random number of bounding boxes per image. The bounding boxes have fixed width and height, but their location - is randomised. + is randomized. Both images and annotations are torch tensors. """ n_images = 5 img_size = 256 @@ -130,7 +131,7 @@ def dummy_dataset(): return images, annotations -@pytest.fixture(scope="session") +@pytest.fixture(scope="module") def dummy_dataset_dirs(dummy_dataset, tmp_path_factory): """Save dummy dataset to temporary directories and return their paths.""" @@ -138,15 +139,22 @@ def dummy_dataset_dirs(dummy_dataset, tmp_path_factory): images, annotations = dummy_dataset # Create temporary directories - frames_path = tmp_path_factory.mktemp("frames") - annotations_path = tmp_path_factory.mktemp("annotations") + frames_path = tmp_path_factory.mktemp("frames", numbered=False) + annotations_path = tmp_path_factory.mktemp("annotations", numbered=False) # Save images to temporary directory + list_img_filenames = [] for idx, img in enumerate(images): out_path = frames_path / f"frame_{idx:04d}.png" save_image(img, out_path) + list_img_filenames.append(out_path.name) # Save annotations with expected format to temporary directory + annotations_dict = bbox_tensors_to_COCO_dict( + annotations, list_img_filenames + ) + with open(annotations_path / "sample.json", "w") as f: + json.dump(annotations_dict, f) # return as dict dataset_dict = { @@ -157,6 +165,64 @@ def dummy_dataset_dirs(dummy_dataset, tmp_path_factory): return dataset_dict +def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None): + """Convert list of tensors with bounding boxes to COCO format + for a crab dataset. + + Parameters + ---------- + bbox_tensors : list[torch.Tensor] + List of tensors with bounding boxes for each image. + Each element of the list corresponds to an image, and each tensor in + the list contains the bounding boxes for that image. Each tensor is of + size (n, 4) where n is the number of bounding boxes in the image. + The 4 values in the second dimension are x_min, y_min, x_max, y_max. + + Returns + ------- + dict + COCO format dictionary with bounding boxes. + """ + # Create list of dictionaries for images + list_images = [] + for img_id, img_name in enumerate(list_img_filenames): + image_entry = { + "id": img_id + 1, # 1-based + "width": 0, + "height": 0, + "file_name": img_name, + } + list_images.append(image_entry) + + # Create list of dictionaries for annotations + list_annotations = [] + for img_id, img_bboxes in enumerate(bbox_tensors): + # loop thru bboxes in image + for bbox_row in img_bboxes: + x_min, y_min, x_max, y_max = bbox_row.numpy().tolist() + # we convert the array to list to make it JSON serializable + + annotation = { + "id": len(list_annotations) + 1, # 1-based + "image_id": img_id, + "bbox": [x_min, y_min, x_max - x_min, y_max - y_min], + "category_id": 1, + } + + list_annotations.append(annotation) + + # Create COCO dictionary + coco_dict = { + "info": {}, + "licenses": [], + "categories": [{"id": 1, "name": "crab", "supercategory": "animal"}], + "images": list_images, + "annotations": list_annotations, + } + + return coco_dict + + @pytest.mark.parametrize( "crabs_data_module, expected_train_transforms", [ @@ -263,9 +329,13 @@ def test_compute_splits( seed, expected_indices, dummy_dataset_dirs, - default_train_config, # ---- edit config too? + default_train_config, ): - """Test dataset splits are reproducible and according to the requested fraction""" + """Test dataset splits are reproducible and according to the requested + fraction""" + + # Edit config to change fraction according to parametrisation? + # ... # Create datamodule dm = CrabsDataModule( @@ -295,12 +365,6 @@ def test_compute_splits( assert len(val_dataset) / total_dataset_size == 0.1 # Check splits are non-overlapping in image IDs - # --- I cannot do this because samples are tuple(image, annotation) - # assert len(set(train_dataset) & set(test_dataset)) == 0 - # assert len(set(train_dataset) & set(val_dataset)) == 0 - # assert len(set(test_dataset) & set(val_dataset)) == 0 - # assert len(set(train_dataset) & set(test_dataset)) == 0 - # Compute lists of image IDs per dataset image_ids_per_dataset = {} for dataset, dataset_str in zip( @@ -312,6 +376,7 @@ def test_compute_splits( # Check splits are non-overlapping in image IDs # TODO: Can I improve this? it is v slow! + # maybe use indices, all referred to original dataset? assert ( len( set(image_ids_per_dataset["train"]) @@ -336,7 +401,7 @@ def test_compute_splits( # Check splits are reproducible # we check we always get the same indices from the dataset - # we input to `random_split` given the same seed + # that we input to `random_split`, given the same seed # Note that the indices are not the same as the image IDs! assert train_dataset.indices[:3] == expected_indices["train"] assert test_dataset.indices[:3] == expected_indices["test"] From e76896c914500afe0786c29d8c798c2e58a921be Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:08:00 +0100 Subject: [PATCH 07/17] Create fixture as factory of dummy datasets --- tests/test_unit/test_datamodules.py | 106 ++++++++++++++++------------ 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 021194b8..49246754 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -101,68 +101,74 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): @pytest.fixture(scope="module") -def dummy_dataset(): - """Create dummy images and annotations for testing. +def create_dummy_dataset(): + """Return a factory of dummy images and annotations for testing. - The dataset consists of 5 images, with a random number of bounding boxes + The dataset consists of N images, with a random number of bounding boxes per image. The bounding boxes have fixed width and height, but their location is randomized. Both images and annotations are torch tensors. """ - n_images = 5 - img_size = 256 - fixed_width_height = 10 - - images = [torch.randn(3, img_size, img_size) for _ in range(n_images)] - annotations = [] - for _ in range(n_images): - # Generate random number of bounding boxes for each image - n_bboxes = random.randint(1, 5) - boxes = [] - for _ in range(n_bboxes): - # Randomise the location of the top left corner of the bounding box - x_min = random.randint(0, img_size - fixed_width_height) - y_min = random.randint(0, img_size - fixed_width_height) - - # Add fixed width and height to get the bottom right corner - x_max = x_min + fixed_width_height - y_max = y_min + fixed_width_height - boxes.append([x_min, y_min, x_max, y_max]) - annotations.append(torch.tensor(boxes)) - return images, annotations + + def _create_dummy_dataset(n_images): + # n_images = 10 # needs to be > 5 to avoid floating point errors in dataset split + img_size = 256 + fixed_width_height = 10 + + images = [torch.randn(3, img_size, img_size) for _ in range(n_images)] + annotations = [] + for _ in range(n_images): + # Generate random number of bounding boxes for each image + n_bboxes = random.randint(1, 5) + boxes = [] + for _ in range(n_bboxes): + # Randomise the location of the top left corner of the bounding box + x_min = random.randint(0, img_size - fixed_width_height) + y_min = random.randint(0, img_size - fixed_width_height) + + # Add fixed width and height to get the bottom right corner + x_max = x_min + fixed_width_height + y_max = y_min + fixed_width_height + boxes.append([x_min, y_min, x_max, y_max]) + annotations.append(torch.tensor(boxes)) + return images, annotations + + return _create_dummy_dataset # return function handle! @pytest.fixture(scope="module") -def dummy_dataset_dirs(dummy_dataset, tmp_path_factory): - """Save dummy dataset to temporary directories and return their paths.""" +def dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory): + """Return a dictionary with dataset paths for testing. + + The dataset corresponds to a 50-image dataset with dummy annotations + in COCO format. + """ # Get dummy data - images, annotations = dummy_dataset + images, annotations = create_dummy_dataset(n_images=50) # Create temporary directories - frames_path = tmp_path_factory.mktemp("frames", numbered=False) - annotations_path = tmp_path_factory.mktemp("annotations", numbered=False) + frames_dir = tmp_path_factory.mktemp("frames", numbered=False) + annotations_dir = tmp_path_factory.mktemp("annotations", numbered=False) + annotations_file_path = annotations_dir / "sample.json" # Save images to temporary directory - list_img_filenames = [] for idx, img in enumerate(images): - out_path = frames_path / f"frame_{idx:04d}.png" + out_path = frames_dir / f"frame_{idx:04d}.png" save_image(img, out_path) - list_img_filenames.append(out_path.name) - # Save annotations with expected format to temporary directory - annotations_dict = bbox_tensors_to_COCO_dict( - annotations, list_img_filenames - ) - with open(annotations_path / "sample.json", "w") as f: - json.dump(annotations_dict, f) + # Save annotations file with expected format to temporary directory + annotations_dict = bbox_tensors_to_COCO_dict(annotations) + + with open(annotations_file_path, "w") as f: + json.dump(annotations_dict, f, indent=4) # pretty print - # return as dict - dataset_dict = { - "frames": frames_path, - "annotations": annotations_path, + # Return paths as dict + dataset_paths = { + "frames": frames_dir, + "annotations": annotations_file_path, } - return dataset_dict + return dataset_paths def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None): @@ -177,12 +183,22 @@ def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None): the list contains the bounding boxes for that image. Each tensor is of size (n, 4) where n is the number of bounding boxes in the image. The 4 values in the second dimension are x_min, y_min, x_max, y_max. + list_img_filenames : list[str], optional + List of image filenames. If not provided, filenames are generated + as "frame_{i:04d}.png" where i is the 0-based index of the image in the + list of bounding boxes. Returns ------- dict COCO format dictionary with bounding boxes. """ + # Create list of image filenames if not provided + if list_img_filenames is None: + list_img_filenames = [ + f"frame_{i:04d}.png" for i in range(len(bbox_tensors)) + ] + # Create list of dictionaries for images list_images = [] for img_id, img_name in enumerate(list_img_filenames): @@ -205,8 +221,10 @@ def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None): annotation = { "id": len(list_annotations) + 1, # 1-based "image_id": img_id, - "bbox": [x_min, y_min, x_max - x_min, y_max - y_min], "category_id": 1, + "bbox": [x_min, y_min, x_max - x_min, y_max - y_min], + "area": (x_max - x_min) * (y_max - y_min), + "iscrowd": 0, } list_annotations.append(annotation) From ce3a3f87b6327a9cd148b568d48eb08d5a1eca81 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:08:35 +0100 Subject: [PATCH 08/17] Tests pass --- tests/test_unit/test_datamodules.py | 34 ++++++++++++++++++----------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 49246754..1aef0806 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -2,6 +2,7 @@ import random from pathlib import Path +import numpy as np import pytest import torch import torchvision.transforms.v2 as transforms @@ -319,8 +320,10 @@ def test_get_test_val_transform( "crabs_data_module_without_data_augm", ], ) -def test_collate_fn(crabs_data_module, dummy_dataset, request): +def test_collate_fn(crabs_data_module, create_dummy_dataset, request): crabs_data_module = request.getfixturevalue(crabs_data_module) + + dummy_dataset = create_dummy_dataset(n_images=5) collated_data = crabs_data_module._collate_fn(dummy_dataset) assert len(collated_data) == len(dummy_dataset[0]) # images @@ -339,8 +342,8 @@ def test_collate_fn(crabs_data_module, dummy_dataset, request): @pytest.mark.parametrize( "seed, expected_indices", [ - (123, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}), - (42, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}), + (123, {"train": [32, 30, 0], "test": [4, 6, 2], "val": [7, 1, 8]}), + (42, {"train": [42, 17, 30], "test": [6, 4, 0], "val": [8, 3, 2]}), ], ) def test_compute_splits( @@ -354,6 +357,8 @@ def test_compute_splits( # Edit config to change fraction according to parametrisation? # ... + # TODO: test different dataset sizes + # TODO: test different fractions # Create datamodule dm = CrabsDataModule( @@ -372,15 +377,18 @@ def test_compute_splits( train_dataset, _, _ = dm._compute_splits(train_transform) _, test_dataset, val_dataset = dm._compute_splits(test_and_val_transform) - # Check split sizes are as expected + # Check total size of dataset total_dataset_size = ( len(train_dataset) + len(test_dataset) + len(val_dataset) ) - assert total_dataset_size == 50 - # TODO: change to np.isclose - assert len(train_dataset) / total_dataset_size == 0.8 - assert len(test_dataset) / total_dataset_size == 0.1 - assert len(val_dataset) / total_dataset_size == 0.1 + n_frame_files = len(list(dummy_dataset_dirs["frames"].glob("*.png"))) + + assert total_dataset_size == n_frame_files + + # Check split sizes are as expected + assert np.isclose(len(train_dataset) / total_dataset_size, 0.8, atol=0.05) + assert np.isclose(len(test_dataset) / total_dataset_size, 0.1, atol=0.05) + assert np.isclose(len(val_dataset) / total_dataset_size, 0.1, atol=0.05) # Check splits are non-overlapping in image IDs # Compute lists of image IDs per dataset @@ -392,7 +400,6 @@ def test_compute_splits( sample[1]["image_id"] for sample in dataset ] - # Check splits are non-overlapping in image IDs # TODO: Can I improve this? it is v slow! # maybe use indices, all referred to original dataset? assert ( @@ -417,9 +424,10 @@ def test_compute_splits( == 0 ) - # Check splits are reproducible - # we check we always get the same indices from the dataset - # that we input to `random_split`, given the same seed + # Check splits are reproducible. + # We check that given the same seed, we always get the + # same indices. The indices refer to the input dataset to + # `random_split`. # Note that the indices are not the same as the image IDs! assert train_dataset.indices[:3] == expected_indices["train"] assert test_dataset.indices[:3] == expected_indices["test"] From 09ac68112aebc537679778a5c35cce7c42e0a100 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:47:33 +0100 Subject: [PATCH 09/17] Make fixture factory of dataset dirs --- tests/test_unit/test_datamodules.py | 49 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 1aef0806..9b5d30e6 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -136,40 +136,43 @@ def _create_dummy_dataset(n_images): return _create_dummy_dataset # return function handle! -@pytest.fixture(scope="module") -def dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory): +@pytest.fixture() +def create_dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory): """Return a dictionary with dataset paths for testing. - The dataset corresponds to a 50-image dataset with dummy annotations + The dataset points to an N-image dataset with dummy annotations in COCO format. """ - # Get dummy data - images, annotations = create_dummy_dataset(n_images=50) + def _create_dummy_dataset_dirs(n_images): + # Get dummy data + images, annotations = create_dummy_dataset(n_images) - # Create temporary directories - frames_dir = tmp_path_factory.mktemp("frames", numbered=False) - annotations_dir = tmp_path_factory.mktemp("annotations", numbered=False) - annotations_file_path = annotations_dir / "sample.json" + # Create temporary directories + frames_dir = tmp_path_factory.mktemp("frames") + annotations_dir = tmp_path_factory.mktemp("annotations") + annotations_file_path = annotations_dir / "sample.json" - # Save images to temporary directory - for idx, img in enumerate(images): - out_path = frames_dir / f"frame_{idx:04d}.png" - save_image(img, out_path) + # Save images to temporary directory + for idx, img in enumerate(images): + out_path = frames_dir / f"frame_{idx:04d}.png" + save_image(img, out_path) - # Save annotations file with expected format to temporary directory - annotations_dict = bbox_tensors_to_COCO_dict(annotations) + # Save annotations file with expected format to temporary directory + annotations_dict = bbox_tensors_to_COCO_dict(annotations) - with open(annotations_file_path, "w") as f: - json.dump(annotations_dict, f, indent=4) # pretty print + with open(annotations_file_path, "w") as f: + json.dump(annotations_dict, f, indent=4) # pretty print - # Return paths as dict - dataset_paths = { - "frames": frames_dir, - "annotations": annotations_file_path, - } + # Return paths as dict + dataset_paths = { + "frames": frames_dir, + "annotations": annotations_file_path, + } + + return dataset_paths - return dataset_paths + return _create_dummy_dataset_dirs # return function handle! def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None): From 215e3663f1e67f754d58a936ab806eb94246c776 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:48:05 +0100 Subject: [PATCH 10/17] Expand test parametrisation --- tests/test_unit/test_datamodules.py | 99 ++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 31 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 9b5d30e6..0da73564 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -101,7 +101,7 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): return transform1_attrs_without_fns == transform2_attrs_without_fns -@pytest.fixture(scope="module") +@pytest.fixture() def create_dummy_dataset(): """Return a factory of dummy images and annotations for testing. @@ -343,40 +343,66 @@ def test_collate_fn(crabs_data_module, create_dummy_dataset, request): @pytest.mark.parametrize( - "seed, expected_indices", + "dataset_size, seed, train_fraction, val_over_test_fraction, expected_img_ids_per_split", [ - (123, {"train": [32, 30, 0], "test": [4, 6, 2], "val": [7, 1, 8]}), - (42, {"train": [42, 17, 30], "test": [6, 4, 0], "val": [8, 3, 2]}), + ( + 50, + 123, + 0.8, + 0.5, + {"train": [33, 31, 1], "test": [21, 44, 41], "val": [36, 40, 27]}, + ), + ( + 100, + 42, + 0.6, + 0.5, + {"train": [43, 97, 63], "test": [9, 66, 1], "val": [73, 91, 86]}, + ), + ( + 250, + 37, + 0.6, + 0.25, + { + "train": [32, 50, 119], + "test": [107, 9, 68], + "val": [199, 180, 168], + }, + ), ], ) def test_compute_splits( + dataset_size, seed, - expected_indices, - dummy_dataset_dirs, + train_fraction, + val_over_test_fraction, + expected_img_ids_per_split, + create_dummy_dataset_dirs, default_train_config, ): """Test dataset splits are reproducible and according to the requested fraction""" - # Edit config to change fraction according to parametrisation? - # ... - # TODO: test different dataset sizes - # TODO: test different fractions + # Create a dummy dataset and get directories + dataset_dirs = create_dummy_dataset_dirs(n_images=dataset_size) + + # Edit config to change splits' fractions + default_train_config["train_fraction"] = train_fraction + default_train_config["val_over_test_fraction"] = val_over_test_fraction # Create datamodule dm = CrabsDataModule( - list_img_dirs=[dummy_dataset_dirs["frames"]], - list_annotation_files=[dummy_dataset_dirs["annotations"]], + list_img_dirs=[dataset_dirs["frames"]], + list_annotation_files=[dataset_dirs["annotations"]], config=default_train_config, split_seed=seed, no_data_augmentation=False, ) - # Add transforms + # Compute splits train_transform = dm._get_test_val_transform() test_and_val_transform = dm._get_test_val_transform() - - # Compute splits train_dataset, _, _ = dm._compute_splits(train_transform) _, test_dataset, val_dataset = dm._compute_splits(test_and_val_transform) @@ -384,14 +410,23 @@ def test_compute_splits( total_dataset_size = ( len(train_dataset) + len(test_dataset) + len(val_dataset) ) - n_frame_files = len(list(dummy_dataset_dirs["frames"].glob("*.png"))) - + n_frame_files = len(list(dataset_dirs["frames"].glob("*.png"))) assert total_dataset_size == n_frame_files - # Check split sizes are as expected - assert np.isclose(len(train_dataset) / total_dataset_size, 0.8, atol=0.05) - assert np.isclose(len(test_dataset) / total_dataset_size, 0.1, atol=0.05) - assert np.isclose(len(val_dataset) / total_dataset_size, 0.1, atol=0.05) + # Check split sizes match requested fractions + assert np.isclose( + len(train_dataset) / total_dataset_size, train_fraction, atol=0.05 + ) + assert np.isclose( + len(test_dataset) / total_dataset_size, + (1.0 - train_fraction) * (1.0 - val_over_test_fraction), + atol=0.05, + ) + assert np.isclose( + len(val_dataset) / total_dataset_size, + (1.0 - train_fraction) * val_over_test_fraction, + atol=0.05, + ) # Check splits are non-overlapping in image IDs # Compute lists of image IDs per dataset @@ -403,8 +438,6 @@ def test_compute_splits( sample[1]["image_id"] for sample in dataset ] - # TODO: Can I improve this? it is v slow! - # maybe use indices, all referred to original dataset? assert ( len( set(image_ids_per_dataset["train"]) @@ -427,11 +460,15 @@ def test_compute_splits( == 0 ) - # Check splits are reproducible. - # We check that given the same seed, we always get the - # same indices. The indices refer to the input dataset to - # `random_split`. - # Note that the indices are not the same as the image IDs! - assert train_dataset.indices[:3] == expected_indices["train"] - assert test_dataset.indices[:3] == expected_indices["test"] - assert val_dataset.indices[:3] == expected_indices["val"] + # Check dataset creation is reproducible by checking + # the first 3 image IDs are as expected + assert ( + image_ids_per_dataset["train"][:3] + == expected_img_ids_per_split["train"] + ) + assert ( + image_ids_per_dataset["test"][:3] == expected_img_ids_per_split["test"] + ) + assert ( + image_ids_per_dataset["val"][:3] == expected_img_ids_per_split["val"] + ) From 11f481c9f8230f36660da1169d6bf6ff056c062b Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 21 Oct 2024 16:52:37 +0100 Subject: [PATCH 11/17] Small edits to docstrings --- tests/test_unit/test_datamodules.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 0da73564..c6c5b552 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -138,7 +138,7 @@ def _create_dummy_dataset(n_images): @pytest.fixture() def create_dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory): - """Return a dictionary with dataset paths for testing. + """Return a factory of dictionaries with dataset paths for testing. The dataset points to an N-image dataset with dummy annotations in COCO format. @@ -324,22 +324,23 @@ def test_get_test_val_transform( ], ) def test_collate_fn(crabs_data_module, create_dummy_dataset, request): + """Test collate function formats the dataset as expected.""" crabs_data_module = request.getfixturevalue(crabs_data_module) - dummy_dataset = create_dummy_dataset(n_images=5) - collated_data = crabs_data_module._collate_fn(dummy_dataset) + dataset = create_dummy_dataset(n_images=5) + collated_data = crabs_data_module._collate_fn(dataset) - assert len(collated_data) == len(dummy_dataset[0]) # images - assert len(collated_data) == len(dummy_dataset[1]) # annotations + assert len(collated_data) == len(dataset[0]) # images + assert len(collated_data) == len(dataset[1]) # annotations for i, sample in enumerate(collated_data): - # check length + # check length is 2 -> (image, annotation) assert len(sample) == 2 - # check same content as in dummy dataset + # check content is the same as in input dataset image, annotation = sample - assert torch.equal(image, dummy_dataset[0][i]) - assert torch.equal(annotation, dummy_dataset[1][i]) + assert torch.equal(image, dataset[0][i]) + assert torch.equal(annotation, dataset[1][i]) @pytest.mark.parametrize( From 870b524145b58e7b07681c2ef807975596773a0f Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 23 Oct 2024 18:51:33 +0100 Subject: [PATCH 12/17] Move bbox_tensors_to_COCO_dict to utils --- crabs/detector/utils/detection.py | 74 ++++++++++++++++++++++++++- tests/test_unit/test_datamodules.py | 79 ++--------------------------- 2 files changed, 78 insertions(+), 75 deletions(-) diff --git a/crabs/detector/utils/detection.py b/crabs/detector/utils/detection.py index b2f82ba6..df4de718 100644 --- a/crabs/detector/utils/detection.py +++ b/crabs/detector/utils/detection.py @@ -4,8 +4,9 @@ import datetime import os from pathlib import Path -from typing import Any +from typing import Any, Optional +import torch from lightning.pytorch.loggers import MLFlowLogger DEFAULT_ANNOTATIONS_FILENAME = "VIA_JSON_combined_coco_gen.json" @@ -242,3 +243,74 @@ def slurm_logs_as_artifacts(logger: MLFlowLogger, slurm_job_id: str): logger.run_id, f"{log_filename}.{ext}", ) + + +def bbox_tensors_to_COCO_dict( + bbox_tensors: torch.Tensor, list_img_filenames: Optional[list] = None +) -> dict: + """Convert list of bounding boxes as tensors to COCO-crab format. + + Parameters + ---------- + bbox_tensors : list[torch.Tensor] + List of tensors with bounding boxes for each image. + Each element of the list corresponds to an image, and each tensor in + the list contains the bounding boxes for that image. Each tensor is of + size (n, 4) where n is the number of bounding boxes in the image. + The 4 values in the second dimension are x_min, y_min, x_max, y_max. + list_img_filenames : list[str], optional + List of image filenames. If not provided, filenames are generated + as "frame_{i:04d}.png" where i is the 0-based index of the image in the + list of bounding boxes. + + Returns + ------- + dict + COCO format dictionary with bounding boxes. + """ + # Create list of image filenames if not provided + if list_img_filenames is None: + list_img_filenames = [ + f"frame_{i:04d}.png" for i in range(len(bbox_tensors)) + ] + + # Create list of dictionaries for images + list_images: list[dict] = [] + for img_id, img_name in enumerate(list_img_filenames): + image_entry = { + "id": img_id + 1, # 1-based + "width": 0, + "height": 0, + "file_name": img_name, + } + list_images.append(image_entry) + + # Create list of dictionaries for annotations + list_annotations: list[dict] = [] + for img_id, img_bboxes in enumerate(bbox_tensors): + # loop thru bboxes in image + for bbox_row in img_bboxes: + x_min, y_min, x_max, y_max = bbox_row.numpy().tolist() + # we convert the array to list to make it JSON serializable + + annotation = { + "id": len(list_annotations) + 1, # 1-based + "image_id": img_id, + "category_id": 1, + "bbox": [x_min, y_min, x_max - x_min, y_max - y_min], + "area": (x_max - x_min) * (y_max - y_min), + "iscrowd": 0, + } + + list_annotations.append(annotation) + + # Create COCO dictionary + coco_dict = { + "info": {}, + "licenses": [], + "categories": [{"id": 1, "name": "crab", "supercategory": "animal"}], + "images": list_images, + "annotations": list_annotations, + } + + return coco_dict diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index c6c5b552..79abbf6d 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -10,6 +10,7 @@ from torchvision.utils import save_image from crabs.detector.datamodules import CrabsDataModule +from crabs.detector.utils.detection import bbox_tensors_to_COCO_dict DEFAULT_CONFIG = ( Path(__file__).parents[2] @@ -105,7 +106,7 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip): def create_dummy_dataset(): """Return a factory of dummy images and annotations for testing. - The dataset consists of N images, with a random number of bounding boxes + The created datasets consist of N images, with a random number of bounding boxes per image. The bounding boxes have fixed width and height, but their location is randomized. Both images and annotations are torch tensors. """ @@ -133,14 +134,14 @@ def _create_dummy_dataset(n_images): annotations.append(torch.tensor(boxes)) return images, annotations - return _create_dummy_dataset # return function handle! + return _create_dummy_dataset @pytest.fixture() def create_dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory): """Return a factory of dictionaries with dataset paths for testing. - The dataset points to an N-image dataset with dummy annotations + The linked datasets are N-images datasets with dummy annotations in COCO format. """ @@ -172,77 +173,7 @@ def _create_dummy_dataset_dirs(n_images): return dataset_paths - return _create_dummy_dataset_dirs # return function handle! - - -def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None): - """Convert list of tensors with bounding boxes to COCO format - for a crab dataset. - - Parameters - ---------- - bbox_tensors : list[torch.Tensor] - List of tensors with bounding boxes for each image. - Each element of the list corresponds to an image, and each tensor in - the list contains the bounding boxes for that image. Each tensor is of - size (n, 4) where n is the number of bounding boxes in the image. - The 4 values in the second dimension are x_min, y_min, x_max, y_max. - list_img_filenames : list[str], optional - List of image filenames. If not provided, filenames are generated - as "frame_{i:04d}.png" where i is the 0-based index of the image in the - list of bounding boxes. - - Returns - ------- - dict - COCO format dictionary with bounding boxes. - """ - # Create list of image filenames if not provided - if list_img_filenames is None: - list_img_filenames = [ - f"frame_{i:04d}.png" for i in range(len(bbox_tensors)) - ] - - # Create list of dictionaries for images - list_images = [] - for img_id, img_name in enumerate(list_img_filenames): - image_entry = { - "id": img_id + 1, # 1-based - "width": 0, - "height": 0, - "file_name": img_name, - } - list_images.append(image_entry) - - # Create list of dictionaries for annotations - list_annotations = [] - for img_id, img_bboxes in enumerate(bbox_tensors): - # loop thru bboxes in image - for bbox_row in img_bboxes: - x_min, y_min, x_max, y_max = bbox_row.numpy().tolist() - # we convert the array to list to make it JSON serializable - - annotation = { - "id": len(list_annotations) + 1, # 1-based - "image_id": img_id, - "category_id": 1, - "bbox": [x_min, y_min, x_max - x_min, y_max - y_min], - "area": (x_max - x_min) * (y_max - y_min), - "iscrowd": 0, - } - - list_annotations.append(annotation) - - # Create COCO dictionary - coco_dict = { - "info": {}, - "licenses": [], - "categories": [{"id": 1, "name": "crab", "supercategory": "animal"}], - "images": list_images, - "annotations": list_annotations, - } - - return coco_dict + return _create_dummy_dataset_dirs @pytest.mark.parametrize( From 833623673f47be6cb41fb17fa12a9478aee89d2f Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 23 Oct 2024 18:58:18 +0100 Subject: [PATCH 13/17] Clarify comment --- tests/test_unit/test_datamodules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 79abbf6d..4f9ca86d 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -316,7 +316,7 @@ def test_compute_splits( """Test dataset splits are reproducible and according to the requested fraction""" - # Create a dummy dataset and get directories + # Create a dummy dataset and get paths to its directories dataset_dirs = create_dummy_dataset_dirs(n_images=dataset_size) # Edit config to change splits' fractions From 07a752ab8aa3fba869accbe93bf0c68e869af260 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Wed, 23 Oct 2024 18:59:53 +0100 Subject: [PATCH 14/17] Restore pre-commit config --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5632813f..e16d66da 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -30,8 +30,8 @@ repos: additional_dependencies: - types-setuptools - repo: https://github.com/mgedmin/check-manifest - rev: "0.50" + rev: "0.49" hooks: - id: check-manifest args: [--no-build-isolation] - additional_dependencies: [setuptools-scm, wheel] + additional_dependencies: [setuptools-scm] From c678df6c57d7e1e695688d6246035550120d81b5 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 24 Oct 2024 17:02:43 +0100 Subject: [PATCH 15/17] Rename generators --- crabs/detector/datamodules.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/crabs/detector/datamodules.py b/crabs/detector/datamodules.py index 86e2cfec..2be2744f 100644 --- a/crabs/detector/datamodules.py +++ b/crabs/detector/datamodules.py @@ -169,11 +169,12 @@ def _compute_splits( A tuple with the train, test and validation datasets """ - # Optionally fix the generator for a reproducible split of data - generator_1, generator_2 = None, None + # Optionally fix the random number generators for reproducible + # splits of data + rng_train_split, rng_val_split = None, None if self.split_seed: - generator_1 = torch.Generator().manual_seed(self.split_seed) - generator_2 = torch.Generator().manual_seed(self.split_seed) + rng_train_split = torch.Generator().manual_seed(self.split_seed) + rng_val_split = torch.Generator().manual_seed(self.split_seed) # Create dataset (combining all datasets passed) full_dataset = CrabsCocoDetection( @@ -190,7 +191,7 @@ def _compute_splits( train_dataset, test_val_dataset = random_split( full_dataset, [self.config["train_fraction"], 1 - self.config["train_fraction"]], - generator=generator_1, + generator=rng_train_split, ) # Split test/val sets from the remainder @@ -200,7 +201,7 @@ def _compute_splits( 1 - self.config["val_over_test_fraction"], self.config["val_over_test_fraction"], ], - generator=generator_2, + generator=rng_val_split, ) return train_dataset, test_dataset, val_dataset From 206934f986e57c6c788c5a9b4b9736c28eca7005 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 28 Oct 2024 15:21:43 +0000 Subject: [PATCH 16/17] Fix image ID default indexing (it is 1-based in VIA tool) --- crabs/detector/utils/detection.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crabs/detector/utils/detection.py b/crabs/detector/utils/detection.py index df4de718..dd51bda3 100644 --- a/crabs/detector/utils/detection.py +++ b/crabs/detector/utils/detection.py @@ -294,8 +294,9 @@ def bbox_tensors_to_COCO_dict( # we convert the array to list to make it JSON serializable annotation = { - "id": len(list_annotations) + 1, # 1-based - "image_id": img_id, + "id": len(list_annotations) + + 1, # 1-based by default in VIA tool + "image_id": img_id + 1, # 1-based by default in VIA tool "category_id": 1, "bbox": [x_min, y_min, x_max - x_min, y_max - y_min], "area": (x_max - x_min) * (y_max - y_min), From fb1ad0649bd19af5b3461407f64a08d190f496f0 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Mon, 28 Oct 2024 15:24:32 +0000 Subject: [PATCH 17/17] Remove n=10 comment in the dummy dataset fixture --- tests/test_unit/test_datamodules.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py index 4f9ca86d..2a823605 100644 --- a/tests/test_unit/test_datamodules.py +++ b/tests/test_unit/test_datamodules.py @@ -112,7 +112,11 @@ def create_dummy_dataset(): """ def _create_dummy_dataset(n_images): - # n_images = 10 # needs to be > 5 to avoid floating point errors in dataset split + """Create a dataset with N images and random bounding boxes per image. + + The number of images in the dataset needs to be > 5 to avoid floating point errors + in the dataset split. + """ img_size = 256 fixed_width_height = 10