From 52fe50de105a111d8f6f019aa26e900009fc64a6 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 8 Jul 2024 17:03:39 +0100
Subject: [PATCH 01/17] Fix validation and test split not being reproducible

---
 crabs/detector/datamodules.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/crabs/detector/datamodules.py b/crabs/detector/datamodules.py
index 1e6d18a4..30505efb 100644
--- a/crabs/detector/datamodules.py
+++ b/crabs/detector/datamodules.py
@@ -170,9 +170,10 @@ def _compute_splits(
         """
 
         # Optionally fix the generator for a reproducible split of data
-        generator = None
+        generator_1, generator_2 = None, None
         if self.split_seed:
-            generator = torch.Generator().manual_seed(self.split_seed)
+            generator_1 = torch.Generator().manual_seed(self.split_seed)
+            generator_2 = torch.Generator().manual_seed(self.split_seed)
 
         # Create dataset (combining all datasets passed)
         full_dataset = CrabsCocoDetection(
@@ -189,7 +190,7 @@ def _compute_splits(
         train_dataset, test_val_dataset = random_split(
             full_dataset,
             [self.config["train_fraction"], 1 - self.config["train_fraction"]],
-            generator=generator,
+            generator=generator_1,
         )
 
         # Split test/val sets from the remainder
@@ -199,6 +200,7 @@ def _compute_splits(
                 1 - self.config["val_over_test_fraction"],
                 self.config["val_over_test_fraction"],
             ],
+            generator=generator_2,
         )
 
         return train_dataset, test_dataset, val_dataset

From 0b2c7b2711f9bd75e25ea921a78b8dd4738cc1ed Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 18 Oct 2024 20:38:40 +0200
Subject: [PATCH 02/17] Bump check-manifest hook to 0.50 and add wheel dependency

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e16d66da..5632813f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,8 +30,8 @@ repos:
         additional_dependencies:
           - types-setuptools
   - repo: https://github.com/mgedmin/check-manifest
-    rev: "0.49"
+    rev: "0.50"
     hooks:
       - id: check-manifest
         args: [--no-build-isolation]
-        additional_dependencies: [setuptools-scm]
+        additional_dependencies: [setuptools-scm, wheel]

From 26f2b751d1a032bfc724cf63cab6b5f836baeb33 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 18 Oct 2024 20:38:58 +0200
Subject: [PATCH 03/17] Draft dataset split tests

---
 crabs/detector/datamodules.py       |   2 +-
 tests/test_unit/test_datamodules.py | 181 +++++++++++++++++++++-------
 2 files changed, 139 insertions(+), 44 deletions(-)

diff --git a/crabs/detector/datamodules.py b/crabs/detector/datamodules.py
index 30505efb..86e2cfec 100644
--- a/crabs/detector/datamodules.py
+++ b/crabs/detector/datamodules.py
@@ -218,9 +218,9 @@ def setup(self, stage: str):
         Define the transforms for each split of the data and compute them.
         """
         # Assign transforms
+        self.train_transform = self._get_train_transform()
         # right now assuming validation and test get the same transforms
         test_and_val_transform = self._get_test_val_transform()
-        self.train_transform = self._get_train_transform()
         self.test_transform = test_and_val_transform
         self.val_transform = test_and_val_transform
 
diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index c5ec4080..9e2c40dd 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -24,6 +24,39 @@ def default_train_config():
         return yaml.safe_load(f)
 
 
+@pytest.fixture
+def crabs_data_module_realistic(default_train_config):
+    return CrabsDataModule(
+        list_img_dirs=["dir1"],
+        list_annotation_files=["anno1"],
+        config=default_train_config,
+        split_seed=123,
+        no_data_augmentation=False,
+    )
+
+
+@pytest.fixture
+def crabs_data_module_with_data_augm(default_train_config):
+    return CrabsDataModule(
+        list_img_dirs=["dir1", "dir2"],
+        list_annotation_files=["anno1", "anno2"],
+        config=default_train_config,
+        split_seed=123,
+        no_data_augmentation=False,
+    )
+
+
+@pytest.fixture
+def crabs_data_module_without_data_augm(default_train_config):
+    return CrabsDataModule(
+        list_img_dirs=["dir1", "dir2"],
+        list_annotation_files=["anno1", "anno2"],
+        config=default_train_config,
+        split_seed=123,
+        no_data_augmentation=True,
+    )
+
+
 @pytest.fixture
 def expected_data_augm_transforms():
     return transforms.Compose(
@@ -58,28 +91,6 @@ def expected_no_data_augm_transforms():
     )
 
 
-@pytest.fixture
-def crabs_data_module_with_data_augm(default_train_config):
-    return CrabsDataModule(
-        list_img_dirs=["dir1", "dir2"],
-        list_annotation_files=["anno1", "anno2"],
-        config=default_train_config,
-        split_seed=123,
-        no_data_augmentation=False,
-    )
-
-
-@pytest.fixture
-def crabs_data_module_without_data_augm(default_train_config):
-    return CrabsDataModule(
-        list_img_dirs=["dir1", "dir2"],
-        list_annotation_files=["anno1", "anno2"],
-        config=default_train_config,
-        split_seed=123,
-        no_data_augmentation=True,
-    )
-
-
 def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
     """Compare the attributes of two transforms excluding those in list."""
 
@@ -98,6 +109,27 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
     return transform1_attrs_without_fns == transform2_attrs_without_fns
 
 
+@pytest.fixture
+def dummy_dataset():
+    """Create dummy images and annotations for testing."""
+    num_samples = 5
+    images = [torch.randn(3, 256, 256) for _ in range(num_samples)]
+    annotations = []
+    for _ in range(num_samples):
+        # Generate random number of bounding boxes for each image
+        num_boxes = random.randint(1, 5)
+        boxes = []
+        for _ in range(num_boxes):
+            # Generate random bounding box coordinates within image size
+            x_min = random.randint(0, 200)
+            y_min = random.randint(0, 200)
+            x_max = random.randint(x_min + 10, 256)
+            y_max = random.randint(y_min + 10, 256)
+            boxes.append([x_min, y_min, x_max, y_max])
+        annotations.append(torch.tensor(boxes))
+    return images, annotations
+
+
 @pytest.mark.parametrize(
     "crabs_data_module, expected_train_transforms",
     [
@@ -111,6 +143,7 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
 def test_get_train_transform(
     crabs_data_module, expected_train_transforms, request
 ):
+    """Test transforms linked to training set are as expected"""
     crabs_data_module = request.getfixturevalue(crabs_data_module)
     expected_train_transforms = request.getfixturevalue(
         expected_train_transforms
@@ -150,6 +183,7 @@ def test_get_train_transform(
 def test_get_test_val_transform(
     crabs_data_module, expected_test_val_transforms, request
 ):
+    """Test transforms linked to test and validation sets are as expected"""
     crabs_data_module = request.getfixturevalue(crabs_data_module)
     expected_test_val_transforms = request.getfixturevalue(
         expected_test_val_transforms
@@ -167,27 +201,6 @@ def test_get_test_val_transform(
         assert test_val_tr.__dict__ == expected_test_val_tr.__dict__
 
 
-@pytest.fixture
-def dummy_dataset():
-    """Create dummy images and annotations for testing."""
-    num_samples = 5
-    images = [torch.randn(3, 256, 256) for _ in range(num_samples)]
-    annotations = []
-    for _ in range(num_samples):
-        # Generate random number of bounding boxes for each image
-        num_boxes = random.randint(1, 5)
-        boxes = []
-        for _ in range(num_boxes):
-            # Generate random bounding box coordinates within image size
-            x_min = random.randint(0, 200)
-            y_min = random.randint(0, 200)
-            x_max = random.randint(x_min + 10, 256)
-            y_max = random.randint(y_min + 10, 256)
-            boxes.append([x_min, y_min, x_max, y_max])
-        annotations.append(torch.tensor(boxes))
-    return images, annotations
-
-
 @pytest.mark.parametrize(
     "crabs_data_module",
     [
@@ -210,3 +223,85 @@ def test_collate_fn(crabs_data_module, dummy_dataset, request):
         image, annotation = sample
         assert torch.equal(image, dummy_dataset[0][i])
         assert torch.equal(annotation, dummy_dataset[1][i])
+
+
+@pytest.mark.parametrize(
+    "seed, expected_indices",
+    [
+        (123, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}),
+        (42, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}),
+    ],
+)
+def test_compute_splits(
+    seed,
+    expected_indices,
+    crabs_data_module_realistic,
+):
+    """Test dataset splits are reproducible and according to the requested fraction"""
+
+    # Get transforms
+    dm = crabs_data_module_realistic
+    train_transform = dm._get_test_val_transform()
+    test_and_val_transform = dm._get_test_val_transform()
+
+    # Compute splits
+    train_dataset, _, _ = dm._compute_splits(train_transform)
+    _, test_dataset, val_dataset = dm._compute_splits(test_and_val_transform)
+
+    # Check split sizes are as expected
+    total_dataset_size = (
+        len(train_dataset) + len(test_dataset) + len(val_dataset)
+    )
+    assert total_dataset_size == 50
+    # TODO: change to np.isclose
+    assert len(train_dataset) / total_dataset_size == 0.8
+    assert len(test_dataset) / total_dataset_size == 0.1
+    assert len(val_dataset) / total_dataset_size == 0.1
+
+    # Check splits are non-overlapping in image IDs
+    # --- I cannot do this because samples are tuple(image, annotation)
+    # assert len(set(train_dataset) & set(test_dataset)) == 0
+    # assert len(set(train_dataset) & set(val_dataset)) == 0
+    # assert len(set(test_dataset) & set(val_dataset)) == 0
+    # assert len(set(train_dataset) & set(test_dataset)) == 0
+
+    # Compute lists of image IDs per dataset
+    image_ids_per_dataset = {}
+    for dataset, dataset_str in zip(
+        [train_dataset, test_dataset, val_dataset], ["train", "test", "val"]
+    ):
+        image_ids_per_dataset[dataset_str] = [
+            sample[1]["image_id"] for sample in dataset
+        ]
+
+    # Check splits are non-overlapping in image IDs
+    # TODO: Can I improve this? it is v slow
+    assert (
+        len(
+            set(image_ids_per_dataset["train"])
+            & set(image_ids_per_dataset["test"])
+        )
+        == 0
+    )
+    assert (
+        len(
+            set(image_ids_per_dataset["train"])
+            & set(image_ids_per_dataset["val"])
+        )
+        == 0
+    )
+    assert (
+        len(
+            set(image_ids_per_dataset["test"])
+            & set(image_ids_per_dataset["val"])
+        )
+        == 0
+    )
+
+    # Check splits are reproducible
+    # we check we always get the same indices from the dataset
+    # we input to `random_split` given the same seed
+    # Note that the indices are not the same as the image IDs!
+    assert train_dataset.indices[:3] == expected_indices["train"]
+    assert test_dataset.indices[:3] == expected_indices["test"]
+    assert val_dataset.indices[:3] == expected_indices["val"]

From 17501d3f0ed0eda9d0c1062ccd16ea0b9a581de5 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:36:21 +0100
Subject: [PATCH 04/17] Simplify dummy dataset fixture

---
 tests/test_unit/test_datamodules.py | 32 +++++++++++++++++++----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 9e2c40dd..7b2e5f7e 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -111,20 +111,30 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
 
 @pytest.fixture
 def dummy_dataset():
-    """Create dummy images and annotations for testing."""
-    num_samples = 5
-    images = [torch.randn(3, 256, 256) for _ in range(num_samples)]
+    """Create dummy images and annotations for testing.
+
+    The dataset consists of 5 images, with a random number of bounding boxes
+    per image. The bounding boxes have fixed width and height, but their location
+    is randomised.
+    """
+    n_images = 5
+    img_size = 256
+    fixed_width_height = 10
+
+    images = [torch.randn(3, img_size, img_size) for _ in range(n_images)]
     annotations = []
-    for _ in range(num_samples):
+    for _ in range(n_images):
         # Generate random number of bounding boxes for each image
-        num_boxes = random.randint(1, 5)
+        n_bboxes = random.randint(1, 5)
         boxes = []
-        for _ in range(num_boxes):
-            # Generate random bounding box coordinates within image size
-            x_min = random.randint(0, 200)
-            y_min = random.randint(0, 200)
-            x_max = random.randint(x_min + 10, 256)
-            y_max = random.randint(y_min + 10, 256)
+        for _ in range(n_bboxes):
+            # Randomise the location of the top left corner of the bounding box
+            x_min = random.randint(0, img_size - fixed_width_height)
+            y_min = random.randint(0, img_size - fixed_width_height)
+
+            # Add fixed width and height to get the bottom right corner
+            x_max = x_min + fixed_width_height
+            y_max = y_min + fixed_width_height
             boxes.append([x_min, y_min, x_max, y_max])
         annotations.append(torch.tensor(boxes))
     return images, annotations

From d54ee2572a8f979298d2944d121e1be15a0ac6cc Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 10:37:16 +0100
Subject: [PATCH 05/17] Add dummy dataset directories fixture

---
 tests/test_unit/test_datamodules.py | 56 +++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 15 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 7b2e5f7e..e94a47a9 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -5,6 +5,7 @@
 import torch
 import torchvision.transforms.v2 as transforms
 import yaml  # type: ignore
+from torchvision.utils import save_image
 
 from crabs.detector.datamodules import CrabsDataModule
 
@@ -24,17 +25,6 @@ def default_train_config():
         return yaml.safe_load(f)
 
 
-@pytest.fixture
-def crabs_data_module_realistic(default_train_config):
-    return CrabsDataModule(
-        list_img_dirs=["dir1"],
-        list_annotation_files=["anno1"],
-        config=default_train_config,
-        split_seed=123,
-        no_data_augmentation=False,
-    )
-
-
 @pytest.fixture
 def crabs_data_module_with_data_augm(default_train_config):
     return CrabsDataModule(
@@ -140,6 +130,33 @@ def dummy_dataset():
     return images, annotations
 
 
+@pytest.fixture(scope="session")
+def dummy_dataset_dirs(dummy_dataset, tmp_path_factory):
+    """Save dummy dataset to temporary directories and return their paths."""
+
+    # Get dummy data
+    images, annotations = dummy_dataset
+
+    # Create temporary directories
+    frames_path = tmp_path_factory.mktemp("frames")
+    annotations_path = tmp_path_factory.mktemp("annotations")
+
+    # Save images to temporary directory
+    for idx, img in enumerate(images):
+        out_path = frames_path / f"frame_{idx:04d}.png"
+        save_image(img, out_path)
+
+    # Save annotations with expected format to temporary directory
+
+    # return as dict
+    dataset_dict = {
+        "frames": frames_path,
+        "annotations": annotations_path,
+    }
+
+    return dataset_dict
+
+
 @pytest.mark.parametrize(
     "crabs_data_module, expected_train_transforms",
     [
@@ -245,12 +262,21 @@ def test_collate_fn(crabs_data_module, dummy_dataset, request):
 def test_compute_splits(
     seed,
     expected_indices,
-    crabs_data_module_realistic,
+    dummy_dataset_dirs,
+    default_train_config,  # ---- edit config too?
 ):
     """Test dataset splits are reproducible and according to the requested fraction"""
 
-    # Get transforms
-    dm = crabs_data_module_realistic
+    # Create datamodule
+    dm = CrabsDataModule(
+        list_img_dirs=[dummy_dataset_dirs["frames"]],
+        list_annotation_files=[dummy_dataset_dirs["annotations"]],
+        config=default_train_config,
+        split_seed=seed,
+        no_data_augmentation=False,
+    )
+
+    # Add transforms
     train_transform = dm._get_test_val_transform()
     test_and_val_transform = dm._get_test_val_transform()
 
@@ -285,7 +311,7 @@ def test_compute_splits(
         ]
 
     # Check splits are non-overlapping in image IDs
-    # TODO: Can I improve this? it is v slow
+    # TODO: Can I improve this? it is v slow!
     assert (
         len(
             set(image_ids_per_dataset["train"])

From 85484ba18f8ef688a875c2d01384ee9e80b3ac8b Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 12:49:41 +0100
Subject: [PATCH 06/17] Add annotations file to fixture

---
 tests/test_unit/test_datamodules.py | 93 ++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 14 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index e94a47a9..021194b8 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -1,3 +1,4 @@
+import json
 import random
 from pathlib import Path
 
@@ -99,13 +100,13 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
     return transform1_attrs_without_fns == transform2_attrs_without_fns
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def dummy_dataset():
     """Create dummy images and annotations for testing.
 
     The dataset consists of 5 images, with a random number of bounding boxes
     per image. The bounding boxes have fixed width and height, but their location
-    is randomised.
+    is randomized. Both images and annotations are torch tensors.
     """
     n_images = 5
     img_size = 256
@@ -130,7 +131,7 @@ def dummy_dataset():
     return images, annotations
 
 
-@pytest.fixture(scope="session")
+@pytest.fixture(scope="module")
 def dummy_dataset_dirs(dummy_dataset, tmp_path_factory):
     """Save dummy dataset to temporary directories and return their paths."""
 
@@ -138,15 +139,22 @@ def dummy_dataset_dirs(dummy_dataset, tmp_path_factory):
     images, annotations = dummy_dataset
 
     # Create temporary directories
-    frames_path = tmp_path_factory.mktemp("frames")
-    annotations_path = tmp_path_factory.mktemp("annotations")
+    frames_path = tmp_path_factory.mktemp("frames", numbered=False)
+    annotations_path = tmp_path_factory.mktemp("annotations", numbered=False)
 
     # Save images to temporary directory
+    list_img_filenames = []
     for idx, img in enumerate(images):
         out_path = frames_path / f"frame_{idx:04d}.png"
         save_image(img, out_path)
+        list_img_filenames.append(out_path.name)
 
     # Save annotations with expected format to temporary directory
+    annotations_dict = bbox_tensors_to_COCO_dict(
+        annotations, list_img_filenames
+    )
+    with open(annotations_path / "sample.json", "w") as f:
+        json.dump(annotations_dict, f)
 
     # return as dict
     dataset_dict = {
@@ -157,6 +165,64 @@ def dummy_dataset_dirs(dummy_dataset, tmp_path_factory):
     return dataset_dict
 
 
+def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None):
+    """Convert list of tensors with bounding boxes to COCO format
+    for a crab dataset.
+
+    Parameters
+    ----------
+    bbox_tensors : list[torch.Tensor]
+        List of tensors with bounding boxes for each image.
+        Each element of the list corresponds to an image, and each tensor in
+        the list contains the bounding boxes for that image. Each tensor is of
+        size (n, 4) where n is the number of bounding boxes in the image.
+        The 4 values in the second dimension are x_min, y_min, x_max, y_max.
+
+    Returns
+    -------
+    dict
+        COCO format dictionary with bounding boxes.
+    """
+    # Create list of dictionaries for images
+    list_images = []
+    for img_id, img_name in enumerate(list_img_filenames):
+        image_entry = {
+            "id": img_id + 1,  # 1-based
+            "width": 0,
+            "height": 0,
+            "file_name": img_name,
+        }
+        list_images.append(image_entry)
+
+    # Create list of dictionaries for annotations
+    list_annotations = []
+    for img_id, img_bboxes in enumerate(bbox_tensors):
+        # loop thru bboxes in image
+        for bbox_row in img_bboxes:
+            x_min, y_min, x_max, y_max = bbox_row.numpy().tolist()
+            # we convert the array to list to make it JSON serializable
+
+            annotation = {
+                "id": len(list_annotations) + 1,  # 1-based
+                "image_id": img_id,
+                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
+                "category_id": 1,
+            }
+
+            list_annotations.append(annotation)
+
+    # Create COCO dictionary
+    coco_dict = {
+        "info": {},
+        "licenses": [],
+        "categories": [{"id": 1, "name": "crab", "supercategory": "animal"}],
+        "images": list_images,
+        "annotations": list_annotations,
+    }
+
+    return coco_dict
+
+
 @pytest.mark.parametrize(
     "crabs_data_module, expected_train_transforms",
     [
@@ -263,9 +329,13 @@ def test_compute_splits(
     seed,
     expected_indices,
     dummy_dataset_dirs,
-    default_train_config,  # ---- edit config too?
+    default_train_config,
 ):
-    """Test dataset splits are reproducible and according to the requested fraction"""
+    """Test dataset splits are reproducible and according to the requested
+    fraction"""
+
+    # Edit config to change fraction according to parametrisation?
+    # ...
 
     # Create datamodule
     dm = CrabsDataModule(
@@ -295,12 +365,6 @@ def test_compute_splits(
     assert len(val_dataset) / total_dataset_size == 0.1
 
     # Check splits are non-overlapping in image IDs
-    # --- I cannot do this because samples are tuple(image, annotation)
-    # assert len(set(train_dataset) & set(test_dataset)) == 0
-    # assert len(set(train_dataset) & set(val_dataset)) == 0
-    # assert len(set(test_dataset) & set(val_dataset)) == 0
-    # assert len(set(train_dataset) & set(test_dataset)) == 0
-
     # Compute lists of image IDs per dataset
     image_ids_per_dataset = {}
     for dataset, dataset_str in zip(
@@ -312,6 +376,7 @@ def test_compute_splits(
 
     # Check splits are non-overlapping in image IDs
     # TODO: Can I improve this? it is v slow!
+    # maybe use indices, all referred to original dataset?
     assert (
         len(
             set(image_ids_per_dataset["train"])
@@ -336,7 +401,7 @@ def test_compute_splits(
 
     # Check splits are reproducible
     # we check we always get the same indices from the dataset
-    # we input to `random_split` given the same seed
+    # that we input to `random_split`, given the same seed
     # Note that the indices are not the same as the image IDs!
     assert train_dataset.indices[:3] == expected_indices["train"]
     assert test_dataset.indices[:3] == expected_indices["test"]

From e76896c914500afe0786c29d8c798c2e58a921be Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:08:00 +0100
Subject: [PATCH 07/17] Create fixture as factory of dummy datasets

---
 tests/test_unit/test_datamodules.py | 106 ++++++++++++++++------------
 1 file changed, 62 insertions(+), 44 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 021194b8..49246754 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -101,68 +101,74 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
 
 
 @pytest.fixture(scope="module")
-def dummy_dataset():
-    """Create dummy images and annotations for testing.
+def create_dummy_dataset():
+    """Return a factory of dummy images and annotations for testing.
 
-    The dataset consists of 5 images, with a random number of bounding boxes
+    The dataset consists of N images, with a random number of bounding boxes
     per image. The bounding boxes have fixed width and height, but their location
     is randomized. Both images and annotations are torch tensors.
     """
-    n_images = 5
-    img_size = 256
-    fixed_width_height = 10
-
-    images = [torch.randn(3, img_size, img_size) for _ in range(n_images)]
-    annotations = []
-    for _ in range(n_images):
-        # Generate random number of bounding boxes for each image
-        n_bboxes = random.randint(1, 5)
-        boxes = []
-        for _ in range(n_bboxes):
-            # Randomise the location of the top left corner of the bounding box
-            x_min = random.randint(0, img_size - fixed_width_height)
-            y_min = random.randint(0, img_size - fixed_width_height)
-
-            # Add fixed width and height to get the bottom right corner
-            x_max = x_min + fixed_width_height
-            y_max = y_min + fixed_width_height
-            boxes.append([x_min, y_min, x_max, y_max])
-        annotations.append(torch.tensor(boxes))
-    return images, annotations
+
+    def _create_dummy_dataset(n_images):
+        # n_images = 10  # needs to be > 5 to avoid floating point errors in dataset split
+        img_size = 256
+        fixed_width_height = 10
+
+        images = [torch.randn(3, img_size, img_size) for _ in range(n_images)]
+        annotations = []
+        for _ in range(n_images):
+            # Generate random number of bounding boxes for each image
+            n_bboxes = random.randint(1, 5)
+            boxes = []
+            for _ in range(n_bboxes):
+                # Randomise the location of the top left corner of the bounding box
+                x_min = random.randint(0, img_size - fixed_width_height)
+                y_min = random.randint(0, img_size - fixed_width_height)
+
+                # Add fixed width and height to get the bottom right corner
+                x_max = x_min + fixed_width_height
+                y_max = y_min + fixed_width_height
+                boxes.append([x_min, y_min, x_max, y_max])
+            annotations.append(torch.tensor(boxes))
+        return images, annotations
+
+    return _create_dummy_dataset  # return function handle!
 
 
 @pytest.fixture(scope="module")
-def dummy_dataset_dirs(dummy_dataset, tmp_path_factory):
-    """Save dummy dataset to temporary directories and return their paths."""
+def dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory):
+    """Return a dictionary with dataset paths for testing.
+
+    The dataset corresponds to a 50-image dataset with dummy annotations
+    in COCO format.
+    """
 
     # Get dummy data
-    images, annotations = dummy_dataset
+    images, annotations = create_dummy_dataset(n_images=50)
 
     # Create temporary directories
-    frames_path = tmp_path_factory.mktemp("frames", numbered=False)
-    annotations_path = tmp_path_factory.mktemp("annotations", numbered=False)
+    frames_dir = tmp_path_factory.mktemp("frames", numbered=False)
+    annotations_dir = tmp_path_factory.mktemp("annotations", numbered=False)
+    annotations_file_path = annotations_dir / "sample.json"
 
     # Save images to temporary directory
-    list_img_filenames = []
     for idx, img in enumerate(images):
-        out_path = frames_path / f"frame_{idx:04d}.png"
+        out_path = frames_dir / f"frame_{idx:04d}.png"
         save_image(img, out_path)
-        list_img_filenames.append(out_path.name)
 
-    # Save annotations with expected format to temporary directory
-    annotations_dict = bbox_tensors_to_COCO_dict(
-        annotations, list_img_filenames
-    )
-    with open(annotations_path / "sample.json", "w") as f:
-        json.dump(annotations_dict, f)
+    # Save annotations file with expected format to temporary directory
+    annotations_dict = bbox_tensors_to_COCO_dict(annotations)
+
+    with open(annotations_file_path, "w") as f:
+        json.dump(annotations_dict, f, indent=4)  # pretty print
 
-    # return as dict
-    dataset_dict = {
-        "frames": frames_path,
-        "annotations": annotations_path,
+    # Return paths as dict
+    dataset_paths = {
+        "frames": frames_dir,
+        "annotations": annotations_file_path,
     }
 
-    return dataset_dict
+    return dataset_paths
 
 
 def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None):
@@ -177,12 +183,22 @@ def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None):
         the list contains the bounding boxes for that image. Each tensor is of
         size (n, 4) where n is the number of bounding boxes in the image.
         The 4 values in the second dimension are x_min, y_min, x_max, y_max.
+    list_img_filenames : list[str], optional
+        List of image filenames. If not provided, filenames are generated
+        as "frame_{i:04d}.png" where i is the 0-based index of the image in the
+        list of bounding boxes.
 
     Returns
     -------
     dict
         COCO format dictionary with bounding boxes.
     """
+    # Create list of image filenames if not provided
+    if list_img_filenames is None:
+        list_img_filenames = [
+            f"frame_{i:04d}.png" for i in range(len(bbox_tensors))
+        ]
+
     # Create list of dictionaries for images
     list_images = []
     for img_id, img_name in enumerate(list_img_filenames):
@@ -205,8 +221,10 @@ def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None):
             annotation = {
                 "id": len(list_annotations) + 1,  # 1-based
                 "image_id": img_id,
-                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
                 "category_id": 1,
+                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
+                "area": (x_max - x_min) * (y_max - y_min),
+                "iscrowd": 0,
             }
 
             list_annotations.append(annotation)

From ce3a3f87b6327a9cd148b568d48eb08d5a1eca81 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:08:35 +0100
Subject: [PATCH 08/17] Make dataset split tests pass

---
 tests/test_unit/test_datamodules.py | 34 ++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 49246754..1aef0806 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -2,6 +2,7 @@
 import random
 from pathlib import Path
 
+import numpy as np
 import pytest
 import torch
 import torchvision.transforms.v2 as transforms
@@ -319,8 +320,10 @@ def test_get_test_val_transform(
         "crabs_data_module_without_data_augm",
     ],
 )
-def test_collate_fn(crabs_data_module, dummy_dataset, request):
+def test_collate_fn(crabs_data_module, create_dummy_dataset, request):
     crabs_data_module = request.getfixturevalue(crabs_data_module)
+
+    dummy_dataset = create_dummy_dataset(n_images=5)
     collated_data = crabs_data_module._collate_fn(dummy_dataset)
 
     assert len(collated_data) == len(dummy_dataset[0])  # images
@@ -339,8 +342,8 @@ def test_collate_fn(crabs_data_module, dummy_dataset, request):
 @pytest.mark.parametrize(
     "seed, expected_indices",
     [
-        (123, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}),
-        (42, {"train": [1, 2, 3], "test": [1, 2, 3], "val": [1, 2, 3]}),
+        (123, {"train": [32, 30, 0], "test": [4, 6, 2], "val": [7, 1, 8]}),
+        (42, {"train": [42, 17, 30], "test": [6, 4, 0], "val": [8, 3, 2]}),
     ],
 )
 def test_compute_splits(
@@ -354,6 +357,8 @@ def test_compute_splits(
 
     # Edit config to change fraction according to parametrisation?
     # ...
+    # TODO: test different dataset sizes
+    # TODO: test different fractions
 
     # Create datamodule
     dm = CrabsDataModule(
@@ -372,15 +377,18 @@ def test_compute_splits(
     train_dataset, _, _ = dm._compute_splits(train_transform)
     _, test_dataset, val_dataset = dm._compute_splits(test_and_val_transform)
 
-    # Check split sizes are as expected
+    # Check total size of dataset
     total_dataset_size = (
         len(train_dataset) + len(test_dataset) + len(val_dataset)
     )
-    assert total_dataset_size == 50
-    # TODO: change to np.isclose
-    assert len(train_dataset) / total_dataset_size == 0.8
-    assert len(test_dataset) / total_dataset_size == 0.1
-    assert len(val_dataset) / total_dataset_size == 0.1
+    n_frame_files = len(list(dummy_dataset_dirs["frames"].glob("*.png")))
+
+    assert total_dataset_size == n_frame_files
+
+    # Check split sizes are as expected
+    assert np.isclose(len(train_dataset) / total_dataset_size, 0.8, atol=0.05)
+    assert np.isclose(len(test_dataset) / total_dataset_size, 0.1, atol=0.05)
+    assert np.isclose(len(val_dataset) / total_dataset_size, 0.1, atol=0.05)
 
     # Check splits are non-overlapping in image IDs
     # Compute lists of image IDs per dataset
@@ -392,7 +400,6 @@ def test_compute_splits(
             sample[1]["image_id"] for sample in dataset
         ]
 
-    # Check splits are non-overlapping in image IDs
     # TODO: Can I improve this? it is v slow!
     # maybe use indices, all referred to original dataset?
     assert (
@@ -417,9 +424,10 @@ def test_compute_splits(
         == 0
     )
 
-    # Check splits are reproducible
-    # we check we always get the same indices from the dataset
-    # that we input to `random_split`, given the same seed
+    # Check splits are reproducible.
+    # We check that given the same seed, we always get the
+    # same indices. The indices refer to the input dataset to
+    # `random_split`.
     # Note that the indices are not the same as the image IDs!
     assert train_dataset.indices[:3] == expected_indices["train"]
     assert test_dataset.indices[:3] == expected_indices["test"]

From 09ac68112aebc537679778a5c35cce7c42e0a100 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:47:33 +0100
Subject: [PATCH 09/17] Make fixture factory of dataset dirs

---
 tests/test_unit/test_datamodules.py | 49 +++++++++++++++--------------
 1 file changed, 26 insertions(+), 23 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 1aef0806..9b5d30e6 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -136,40 +136,43 @@ def _create_dummy_dataset(n_images):
     return _create_dummy_dataset  # return function handle!
 
 
-@pytest.fixture(scope="module")
-def dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory):
+@pytest.fixture()
+def create_dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory):
     """Return a dictionary with dataset paths for testing.
 
-    The dataset corresponds to a 50-image dataset with dummy annotations
+    The dataset points to an N-image dataset with dummy annotations
     in COCO format.
     """
 
-    # Get dummy data
-    images, annotations = create_dummy_dataset(n_images=50)
+    def _create_dummy_dataset_dirs(n_images):
+        # Get dummy data
+        images, annotations = create_dummy_dataset(n_images)
 
-    # Create temporary directories
-    frames_dir = tmp_path_factory.mktemp("frames", numbered=False)
-    annotations_dir = tmp_path_factory.mktemp("annotations", numbered=False)
-    annotations_file_path = annotations_dir / "sample.json"
+        # Create temporary directories
+        frames_dir = tmp_path_factory.mktemp("frames")
+        annotations_dir = tmp_path_factory.mktemp("annotations")
+        annotations_file_path = annotations_dir / "sample.json"
 
-    # Save images to temporary directory
-    for idx, img in enumerate(images):
-        out_path = frames_dir / f"frame_{idx:04d}.png"
-        save_image(img, out_path)
+        # Save images to temporary directory
+        for idx, img in enumerate(images):
+            out_path = frames_dir / f"frame_{idx:04d}.png"
+            save_image(img, out_path)
 
-    # Save annotations file with expected format to temporary directory
-    annotations_dict = bbox_tensors_to_COCO_dict(annotations)
+        # Save annotations file with expected format to temporary directory
+        annotations_dict = bbox_tensors_to_COCO_dict(annotations)
 
-    with open(annotations_file_path, "w") as f:
-        json.dump(annotations_dict, f, indent=4)  # pretty print
+        with open(annotations_file_path, "w") as f:
+            json.dump(annotations_dict, f, indent=4)  # pretty print
 
-    # Return paths as dict
-    dataset_paths = {
-        "frames": frames_dir,
-        "annotations": annotations_file_path,
-    }
+        # Return paths as dict
+        dataset_paths = {
+            "frames": frames_dir,
+            "annotations": annotations_file_path,
+        }
+
+        return dataset_paths
 
-    return dataset_paths
+    return _create_dummy_dataset_dirs  # return function handle!
 
 
 def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None):

From 215e3663f1e67f754d58a936ab806eb94246c776 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:48:05 +0100
Subject: [PATCH 10/17] Expand test parametrisation

---
 tests/test_unit/test_datamodules.py | 99 ++++++++++++++++++++---------
 1 file changed, 68 insertions(+), 31 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 9b5d30e6..0da73564 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -101,7 +101,7 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
     return transform1_attrs_without_fns == transform2_attrs_without_fns
 
 
-@pytest.fixture(scope="module")
+@pytest.fixture()
 def create_dummy_dataset():
     """Return a factory of dummy images and annotations for testing.
 
@@ -343,40 +343,66 @@ def test_collate_fn(crabs_data_module, create_dummy_dataset, request):
 
 
 @pytest.mark.parametrize(
-    "seed, expected_indices",
+    "dataset_size, seed, train_fraction, val_over_test_fraction, expected_img_ids_per_split",
     [
-        (123, {"train": [32, 30, 0], "test": [4, 6, 2], "val": [7, 1, 8]}),
-        (42, {"train": [42, 17, 30], "test": [6, 4, 0], "val": [8, 3, 2]}),
+        (
+            50,
+            123,
+            0.8,
+            0.5,
+            {"train": [33, 31, 1], "test": [21, 44, 41], "val": [36, 40, 27]},
+        ),
+        (
+            100,
+            42,
+            0.6,
+            0.5,
+            {"train": [43, 97, 63], "test": [9, 66, 1], "val": [73, 91, 86]},
+        ),
+        (
+            250,
+            37,
+            0.6,
+            0.25,
+            {
+                "train": [32, 50, 119],
+                "test": [107, 9, 68],
+                "val": [199, 180, 168],
+            },
+        ),
     ],
 )
 def test_compute_splits(
+    dataset_size,
     seed,
-    expected_indices,
-    dummy_dataset_dirs,
+    train_fraction,
+    val_over_test_fraction,
+    expected_img_ids_per_split,
+    create_dummy_dataset_dirs,
     default_train_config,
 ):
     """Test dataset splits are reproducible and according to the requested
     fraction"""
 
-    # Edit config to change fraction according to parametrisation?
-    # ...
-    # TODO: test different dataset sizes
-    # TODO: test different fractions
+    # Create a dummy dataset and get directories
+    dataset_dirs = create_dummy_dataset_dirs(n_images=dataset_size)
+
+    # Edit config to change splits' fractions
+    default_train_config["train_fraction"] = train_fraction
+    default_train_config["val_over_test_fraction"] = val_over_test_fraction
 
     # Create datamodule
     dm = CrabsDataModule(
-        list_img_dirs=[dummy_dataset_dirs["frames"]],
-        list_annotation_files=[dummy_dataset_dirs["annotations"]],
+        list_img_dirs=[dataset_dirs["frames"]],
+        list_annotation_files=[dataset_dirs["annotations"]],
         config=default_train_config,
         split_seed=seed,
         no_data_augmentation=False,
     )
 
-    # Add transforms
+    # Compute splits
     train_transform = dm._get_test_val_transform()
     test_and_val_transform = dm._get_test_val_transform()
-
-    # Compute splits
     train_dataset, _, _ = dm._compute_splits(train_transform)
     _, test_dataset, val_dataset = dm._compute_splits(test_and_val_transform)
 
@@ -384,14 +410,23 @@ def test_compute_splits(
     total_dataset_size = (
         len(train_dataset) + len(test_dataset) + len(val_dataset)
     )
-    n_frame_files = len(list(dummy_dataset_dirs["frames"].glob("*.png")))
-
+    n_frame_files = len(list(dataset_dirs["frames"].glob("*.png")))
     assert total_dataset_size == n_frame_files
 
-    # Check split sizes are as expected
-    assert np.isclose(len(train_dataset) / total_dataset_size, 0.8, atol=0.05)
-    assert np.isclose(len(test_dataset) / total_dataset_size, 0.1, atol=0.05)
-    assert np.isclose(len(val_dataset) / total_dataset_size, 0.1, atol=0.05)
+    # Check split sizes match requested fractions
+    assert np.isclose(
+        len(train_dataset) / total_dataset_size, train_fraction, atol=0.05
+    )
+    assert np.isclose(
+        len(test_dataset) / total_dataset_size,
+        (1.0 - train_fraction) * (1.0 - val_over_test_fraction),
+        atol=0.05,
+    )
+    assert np.isclose(
+        len(val_dataset) / total_dataset_size,
+        (1.0 - train_fraction) * val_over_test_fraction,
+        atol=0.05,
+    )
 
     # Check splits are non-overlapping in image IDs
     # Compute lists of image IDs per dataset
@@ -403,8 +438,6 @@ def test_compute_splits(
             sample[1]["image_id"] for sample in dataset
         ]
 
-    # TODO: Can I improve this? it is v slow!
-    # maybe use indices, all referred to original dataset?
     assert (
         len(
             set(image_ids_per_dataset["train"])
@@ -427,11 +460,15 @@ def test_compute_splits(
         == 0
     )
 
-    # Check splits are reproducible.
-    # We check that given the same seed, we always get the
-    # same indices. The indices refer to the input dataset to
-    # `random_split`.
-    # Note that the indices are not the same as the image IDs!
-    assert train_dataset.indices[:3] == expected_indices["train"]
-    assert test_dataset.indices[:3] == expected_indices["test"]
-    assert val_dataset.indices[:3] == expected_indices["val"]
+    # Check dataset creation is reproducible by checking
+    # the first 3 image IDs are as expected
+    assert (
+        image_ids_per_dataset["train"][:3]
+        == expected_img_ids_per_split["train"]
+    )
+    assert (
+        image_ids_per_dataset["test"][:3] == expected_img_ids_per_split["test"]
+    )
+    assert (
+        image_ids_per_dataset["val"][:3] == expected_img_ids_per_split["val"]
+    )

From 11f481c9f8230f36660da1169d6bf6ff056c062b Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 21 Oct 2024 16:52:37 +0100
Subject: [PATCH 11/17] Small edits to docstrings

---
 tests/test_unit/test_datamodules.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 0da73564..c6c5b552 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -138,7 +138,7 @@ def _create_dummy_dataset(n_images):
 
 @pytest.fixture()
 def create_dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory):
-    """Return a dictionary with dataset paths for testing.
+    """Return a factory of dictionaries with dataset paths for testing.
 
     The dataset points to an N-image dataset with dummy annotations
     in COCO format.
@@ -324,22 +324,23 @@ def test_get_test_val_transform(
     ],
 )
 def test_collate_fn(crabs_data_module, create_dummy_dataset, request):
+    """Test collate function formats the dataset as expected."""
     crabs_data_module = request.getfixturevalue(crabs_data_module)
 
-    dummy_dataset = create_dummy_dataset(n_images=5)
-    collated_data = crabs_data_module._collate_fn(dummy_dataset)
+    dataset = create_dummy_dataset(n_images=5)
+    collated_data = crabs_data_module._collate_fn(dataset)
 
-    assert len(collated_data) == len(dummy_dataset[0])  # images
-    assert len(collated_data) == len(dummy_dataset[1])  # annotations
+    assert len(collated_data) == len(dataset[0])  # images
+    assert len(collated_data) == len(dataset[1])  # annotations
 
     for i, sample in enumerate(collated_data):
-        # check length
+        # check length is 2 -> (image, annotation)
         assert len(sample) == 2
 
-        # check same content as in dummy dataset
+        # check content is the same as in input dataset
         image, annotation = sample
-        assert torch.equal(image, dummy_dataset[0][i])
-        assert torch.equal(annotation, dummy_dataset[1][i])
+        assert torch.equal(image, dataset[0][i])
+        assert torch.equal(annotation, dataset[1][i])
 
 
 @pytest.mark.parametrize(

From 870b524145b58e7b07681c2ef807975596773a0f Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 23 Oct 2024 18:51:33 +0100
Subject: [PATCH 12/17] Move bbox_tensors_to_COCO_dict to utils

---
 crabs/detector/utils/detection.py   | 74 ++++++++++++++++++++++++++-
 tests/test_unit/test_datamodules.py | 79 ++---------------------------
 2 files changed, 78 insertions(+), 75 deletions(-)

diff --git a/crabs/detector/utils/detection.py b/crabs/detector/utils/detection.py
index b2f82ba6..df4de718 100644
--- a/crabs/detector/utils/detection.py
+++ b/crabs/detector/utils/detection.py
@@ -4,8 +4,9 @@
 import datetime
 import os
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 
+import torch
 from lightning.pytorch.loggers import MLFlowLogger
 
 DEFAULT_ANNOTATIONS_FILENAME = "VIA_JSON_combined_coco_gen.json"
@@ -242,3 +243,74 @@ def slurm_logs_as_artifacts(logger: MLFlowLogger, slurm_job_id: str):
             logger.run_id,
             f"{log_filename}.{ext}",
         )
+
+
+def bbox_tensors_to_COCO_dict(
+    bbox_tensors: torch.Tensor, list_img_filenames: Optional[list] = None
+) -> dict:
+    """Convert a list of per-image bounding box tensors to COCO-crab format.
+
+    Parameters
+    ----------
+    bbox_tensors : list[torch.Tensor]
+        List of tensors with bounding boxes for each image.
+        Each element of the list corresponds to an image, and each tensor in
+        the list contains the bounding boxes for that image. Each tensor is of
+        size (n, 4) where n is the number of bounding boxes in the image.
+        The 4 values in the second dimension are x_min, y_min, x_max, y_max.
+    list_img_filenames : list[str], optional
+        List of image filenames. If not provided, filenames are generated
+        as "frame_{i:04d}.png" where i is the 0-based index of the image in the
+        list of bounding boxes.
+
+    Returns
+    -------
+    dict
+        COCO format dictionary with bounding boxes.
+    """
+    # Create list of image filenames if not provided
+    if list_img_filenames is None:
+        list_img_filenames = [
+            f"frame_{i:04d}.png" for i in range(len(bbox_tensors))
+        ]
+
+    # Create list of dictionaries for images
+    list_images: list[dict] = []
+    for img_id, img_name in enumerate(list_img_filenames):
+        image_entry = {
+            "id": img_id + 1,  # 1-based
+            "width": 0,
+            "height": 0,
+            "file_name": img_name,
+        }
+        list_images.append(image_entry)
+
+    # Create list of dictionaries for annotations
+    list_annotations: list[dict] = []
+    for img_id, img_bboxes in enumerate(bbox_tensors):
+        # loop through the bounding boxes in this image
+        for bbox_row in img_bboxes:
+            x_min, y_min, x_max, y_max = bbox_row.numpy().tolist()
+            # we convert the array to list to make it JSON serializable
+
+            annotation = {
+                "id": len(list_annotations) + 1,  # 1-based
+                "image_id": img_id,
+                "category_id": 1,
+                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
+                "area": (x_max - x_min) * (y_max - y_min),
+                "iscrowd": 0,
+            }
+
+            list_annotations.append(annotation)
+
+    # Create COCO dictionary
+    coco_dict = {
+        "info": {},
+        "licenses": [],
+        "categories": [{"id": 1, "name": "crab", "supercategory": "animal"}],
+        "images": list_images,
+        "annotations": list_annotations,
+    }
+
+    return coco_dict
diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index c6c5b552..79abbf6d 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -10,6 +10,7 @@
 from torchvision.utils import save_image
 
 from crabs.detector.datamodules import CrabsDataModule
+from crabs.detector.utils.detection import bbox_tensors_to_COCO_dict
 
 DEFAULT_CONFIG = (
     Path(__file__).parents[2]
@@ -105,7 +106,7 @@ def compare_transforms_attrs_excluding(transform1, transform2, keys_to_skip):
 def create_dummy_dataset():
     """Return a factory of dummy images and annotations for testing.
 
-    The dataset consists of N images, with a random number of bounding boxes
+    The created datasets consist of N images, with a random number of bounding boxes
     per image. The bounding boxes have fixed width and height, but their location
     is randomized. Both images and annotations are torch tensors.
     """
@@ -133,14 +134,14 @@ def _create_dummy_dataset(n_images):
             annotations.append(torch.tensor(boxes))
         return images, annotations
 
-    return _create_dummy_dataset  # return function handle!
+    return _create_dummy_dataset
 
 
 @pytest.fixture()
 def create_dummy_dataset_dirs(create_dummy_dataset, tmp_path_factory):
     """Return a factory of dictionaries with dataset paths for testing.
 
-    The dataset points to an N-image dataset with dummy annotations
+    The linked datasets are N-image datasets with dummy annotations
     in COCO format.
     """
 
@@ -172,77 +173,7 @@ def _create_dummy_dataset_dirs(n_images):
 
         return dataset_paths
 
-    return _create_dummy_dataset_dirs  # return function handle!
-
-
-def bbox_tensors_to_COCO_dict(bbox_tensors, list_img_filenames=None):
-    """Convert list of tensors with bounding boxes to COCO format
-    for a crab dataset.
-
-    Parameters
-    ----------
-    bbox_tensors : list[torch.Tensor]
-        List of tensors with bounding boxes for each image.
-        Each element of the list corresponds to an image, and each tensor in
-        the list contains the bounding boxes for that image. Each tensor is of
-        size (n, 4) where n is the number of bounding boxes in the image.
-        The 4 values in the second dimension are x_min, y_min, x_max, y_max.
-    list_img_filenames : list[str], optional
-        List of image filenames. If not provided, filenames are generated
-        as "frame_{i:04d}.png" where i is the 0-based index of the image in the
-        list of bounding boxes.
-
-    Returns
-    -------
-    dict
-        COCO format dictionary with bounding boxes.
-    """
-    # Create list of image filenames if not provided
-    if list_img_filenames is None:
-        list_img_filenames = [
-            f"frame_{i:04d}.png" for i in range(len(bbox_tensors))
-        ]
-
-    # Create list of dictionaries for images
-    list_images = []
-    for img_id, img_name in enumerate(list_img_filenames):
-        image_entry = {
-            "id": img_id + 1,  # 1-based
-            "width": 0,
-            "height": 0,
-            "file_name": img_name,
-        }
-        list_images.append(image_entry)
-
-    # Create list of dictionaries for annotations
-    list_annotations = []
-    for img_id, img_bboxes in enumerate(bbox_tensors):
-        # loop thru bboxes in image
-        for bbox_row in img_bboxes:
-            x_min, y_min, x_max, y_max = bbox_row.numpy().tolist()
-            # we convert the array to list to make it JSON serializable
-
-            annotation = {
-                "id": len(list_annotations) + 1,  # 1-based
-                "image_id": img_id,
-                "category_id": 1,
-                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
-                "area": (x_max - x_min) * (y_max - y_min),
-                "iscrowd": 0,
-            }
-
-            list_annotations.append(annotation)
-
-    # Create COCO dictionary
-    coco_dict = {
-        "info": {},
-        "licenses": [],
-        "categories": [{"id": 1, "name": "crab", "supercategory": "animal"}],
-        "images": list_images,
-        "annotations": list_annotations,
-    }
-
-    return coco_dict
+    return _create_dummy_dataset_dirs
 
 
 @pytest.mark.parametrize(

From 833623673f47be6cb41fb17fa12a9478aee89d2f Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 23 Oct 2024 18:58:18 +0100
Subject: [PATCH 13/17] Clarify comment

---
 tests/test_unit/test_datamodules.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 79abbf6d..4f9ca86d 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -316,7 +316,7 @@ def test_compute_splits(
     """Test dataset splits are reproducible and according to the requested
     fraction"""
 
-    # Create a dummy dataset and get directories
+    # Create a dummy dataset and get paths to its directories
     dataset_dirs = create_dummy_dataset_dirs(n_images=dataset_size)
 
     # Edit config to change splits' fractions

From 07a752ab8aa3fba869accbe93bf0c68e869af260 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Wed, 23 Oct 2024 18:59:53 +0100
Subject: [PATCH 14/17] Restore pre-commit config

---
 .pre-commit-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5632813f..e16d66da 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -30,8 +30,8 @@ repos:
         additional_dependencies:
           - types-setuptools
   - repo: https://github.com/mgedmin/check-manifest
-    rev: "0.50"
+    rev: "0.49"
     hooks:
       - id: check-manifest
         args: [--no-build-isolation]
-        additional_dependencies: [setuptools-scm, wheel]
+        additional_dependencies: [setuptools-scm]

From c678df6c57d7e1e695688d6246035550120d81b5 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Thu, 24 Oct 2024 17:02:43 +0100
Subject: [PATCH 15/17] Rename generators

---
 crabs/detector/datamodules.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/crabs/detector/datamodules.py b/crabs/detector/datamodules.py
index 86e2cfec..2be2744f 100644
--- a/crabs/detector/datamodules.py
+++ b/crabs/detector/datamodules.py
@@ -169,11 +169,12 @@ def _compute_splits(
             A tuple with the train, test and validation datasets
         """
 
-        # Optionally fix the generator for a reproducible split of data
-        generator_1, generator_2 = None, None
+        # Optionally fix the random number generators for reproducible
+        # splits of data
+        rng_train_split, rng_val_split = None, None
         if self.split_seed:
-            generator_1 = torch.Generator().manual_seed(self.split_seed)
-            generator_2 = torch.Generator().manual_seed(self.split_seed)
+            rng_train_split = torch.Generator().manual_seed(self.split_seed)
+            rng_val_split = torch.Generator().manual_seed(self.split_seed)
 
         # Create dataset (combining all datasets passed)
         full_dataset = CrabsCocoDetection(
@@ -190,7 +191,7 @@ def _compute_splits(
         train_dataset, test_val_dataset = random_split(
             full_dataset,
             [self.config["train_fraction"], 1 - self.config["train_fraction"]],
-            generator=generator_1,
+            generator=rng_train_split,
         )
 
         # Split test/val sets from the remainder
@@ -200,7 +201,7 @@ def _compute_splits(
                 1 - self.config["val_over_test_fraction"],
                 self.config["val_over_test_fraction"],
             ],
-            generator=generator_2,
+            generator=rng_val_split,
         )
 
         return train_dataset, test_dataset, val_dataset

From 206934f986e57c6c788c5a9b4b9736c28eca7005 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 28 Oct 2024 15:21:43 +0000
Subject: [PATCH 16/17] Fix image ID default indexing (it is 1-based in VIA
 tool)

---
 crabs/detector/utils/detection.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/crabs/detector/utils/detection.py b/crabs/detector/utils/detection.py
index df4de718..dd51bda3 100644
--- a/crabs/detector/utils/detection.py
+++ b/crabs/detector/utils/detection.py
@@ -294,8 +294,9 @@ def bbox_tensors_to_COCO_dict(
             # we convert the array to list to make it JSON serializable
 
             annotation = {
-                "id": len(list_annotations) + 1,  # 1-based
-                "image_id": img_id,
+                "id": len(list_annotations)
+                + 1,  # 1-based by default in VIA tool
+                "image_id": img_id + 1,  # 1-based by default in VIA tool
                 "category_id": 1,
                 "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
                 "area": (x_max - x_min) * (y_max - y_min),

From fb1ad0649bd19af5b3461407f64a08d190f496f0 Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Mon, 28 Oct 2024 15:24:32 +0000
Subject: [PATCH 17/17] Remove n=10 comment in the dummy dataset fixture

---
 tests/test_unit/test_datamodules.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_unit/test_datamodules.py b/tests/test_unit/test_datamodules.py
index 4f9ca86d..2a823605 100644
--- a/tests/test_unit/test_datamodules.py
+++ b/tests/test_unit/test_datamodules.py
@@ -112,7 +112,11 @@ def create_dummy_dataset():
     """
 
     def _create_dummy_dataset(n_images):
-        # n_images = 10  # needs to be > 5 to avoid floating point errors in dataset split
+        """Create a dataset with N images and random bounding boxes per image.
+
+        The number of images in the dataset needs to be > 5 to avoid floating point errors
+        in the dataset split.
+        """
         img_size = 256
         fixed_width_height = 10