Don't duplicate image data when the file is already available locally

huggingface · Apr 20, 2022 · f8a8553 · f8a8553 · github-actions · Apr 20, 2022
1 parent abad09a
commit f8a8553
Showing 1 changed file with 6 additions and 0 deletions.
diff --git a/src/datasets/features/image.py b/src/datasets/features/image.py
@@ -1,3 +1,4 @@
+import os
 from dataclasses import dataclass, field
 from io import BytesIO
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union
@@ -69,10 +70,15 @@ def encode_example(self, value: Union[str, dict, np.ndarray, "PIL.Image.Image"])
         if isinstance(value, str):
             return {"path": value, "bytes": None}
         elif isinstance(value, np.ndarray):
+            # convert the image array to png bytes
             image = PIL.Image.fromarray(value.astype(np.uint8))
             return {"path": None, "bytes": image_to_bytes(image)}
         elif isinstance(value, PIL.Image.Image):
+            # convert the PIL image to bytes (default format is png)
             return encode_pil_image(value)
+        elif value.get("path") is not None and os.path.isfile(value["path"]):
+            # set "bytes" to None so the data is not duplicated when the file is already available locally
+            return {"bytes": None, "path": value.get("path")}
         elif value.get("bytes") is not None or value.get("path") is not None:
             return {"bytes": value.get("bytes"), "path": value.get("path")}
         else: