Skip to content

Commit

Permalink
don't duplicate data in image
Browse files (browse the repository at this point in the history)
  • Loading branch information
lhoestq committed Apr 20, 2022
1 parent abad09a commit f8a8553
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/datasets/features/image.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from dataclasses import dataclass, field
from io import BytesIO
from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Union
Expand Down
Expand Up
@@ -69,10 +70,15 @@ def encode_example(self, value: Union[str, dict, np.ndarray, "PIL.Image.Image"])
if isinstance(value, str):
return {"path": value, "bytes": None}
elif isinstance(value, np.ndarray):
# convert the image array to png bytes
image = PIL.Image.fromarray(value.astype(np.uint8))
return {"path": None, "bytes": image_to_bytes(image)}
elif isinstance(value, PIL.Image.Image):
# convert the PIL image to bytes (default format is png)
return encode_pil_image(value)
elif value.get("path") is not None and os.path.isfile(value["path"]):
# we set "bytes": None to not duplicate the data if they're already available locally
return {"bytes": None, "path": value.get("path")}
elif value.get("bytes") is not None or value.get("path") is not None:
return {"bytes": value.get("bytes"), "path": value.get("path")}
else:
Expand Down

1 comment on commit f8a8553

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==5.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.011001 / 0.011353 (-0.000352) 0.004966 / 0.011008 (-0.006042) 0.040315 / 0.038508 (0.001807) 0.034299 / 0.023109 (0.011190) 0.366987 / 0.275898 (0.091089) 0.424292 / 0.323480 (0.100812) 0.008182 / 0.007986 (0.000196) 0.004185 / 0.004328 (-0.000143) 0.010457 / 0.004250 (0.006207) 0.037848 / 0.037052 (0.000795) 0.371705 / 0.258489 (0.113216) 0.400826 / 0.293841 (0.106985) 0.044408 / 0.128546 (-0.084138) 0.013650 / 0.075646 (-0.061996) 0.362124 / 0.419271 (-0.057147) 0.060412 / 0.043533 (0.016879) 0.382489 / 0.255139 (0.127350) 0.420467 / 0.283200 (0.137267) 0.103478 / 0.141683 (-0.038205) 2.054765 / 1.452155 (0.602610) 2.051219 / 1.492716 (0.558502)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.303971 / 0.018006 (0.285964) 0.556982 / 0.000490 (0.556493) 0.025553 / 0.000200 (0.025353) 0.000459 / 0.000054 (0.000404)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.025731 / 0.037411 (-0.011681) 0.102934 / 0.014526 (0.088408) 0.116361 / 0.176557 (-0.060196) 0.169332 / 0.737135 (-0.567804) 0.112551 / 0.296338 (-0.183788)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.588428 / 0.215209 (0.373219) 5.675467 / 2.077655 (3.597812) 2.035509 / 1.504120 (0.531389) 1.781842 / 1.541195 (0.240648) 1.768112 / 1.468490 (0.299622) 0.751752 / 4.584777 (-3.833025) 6.273561 / 3.745712 (2.527848) 4.921956 / 5.269862 (-0.347906) 1.472227 / 4.565676 (-3.093450) 0.077813 / 0.424275 (-0.346462) 0.012353 / 0.007607 (0.004746) 0.690572 / 0.226044 (0.464528) 6.999512 / 2.268929 (4.730584) 2.537996 / 55.444624 (-52.906628) 2.086409 / 6.876477 (-4.790068) 2.237566 / 2.142072 (0.095494) 0.918270 / 4.805227 (-3.886957) 0.188413 / 6.500664 (-6.312251) 0.070699 / 0.075469 (-0.004770)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.823333 / 1.841788 (-0.018454) 15.648838 / 8.074308 (7.574530) 38.027923 / 10.191392 (27.836531) 1.033317 / 0.680424 (0.352893) 0.588624 / 0.534201 (0.054423) 0.581469 / 0.579283 (0.002186) 0.694610 / 0.434364 (0.260246) 0.398155 / 0.540337 (-0.142183) 0.427942 / 1.386936 (-0.958994)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.009508 / 0.011353 (-0.001845) 0.004353 / 0.011008 (-0.006656) 0.030996 / 0.038508 (-0.007512) 0.034453 / 0.023109 (0.011344) 0.321320 / 0.275898 (0.045422) 0.351539 / 0.323480 (0.028059) 0.006753 / 0.007986 (-0.001233) 0.006546 / 0.004328 (0.002218) 0.008086 / 0.004250 (0.003836) 0.039484 / 0.037052 (0.002432) 0.311093 / 0.258489 (0.052604) 0.341824 / 0.293841 (0.047983) 0.046371 / 0.128546 (-0.082175) 0.012690 / 0.075646 (-0.062957) 0.284507 / 0.419271 (-0.134765) 0.056287 / 0.043533 (0.012754) 0.309976 / 0.255139 (0.054837) 0.368089 / 0.283200 (0.084889) 0.095768 / 0.141683 (-0.045915) 2.000440 / 1.452155 (0.548285) 1.949163 / 1.492716 (0.456447)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.391811 / 0.018006 (0.373805) 0.546761 / 0.000490 (0.546271) 0.064534 / 0.000200 (0.064334) 0.001828 / 0.000054 (0.001774)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.025865 / 0.037411 (-0.011547) 0.103594 / 0.014526 (0.089068) 0.119063 / 0.176557 (-0.057493) 0.152031 / 0.737135 (-0.585105) 0.118653 / 0.296338 (-0.177686)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.547261 / 0.215209 (0.332052) 5.541426 / 2.077655 (3.463771) 2.116298 / 1.504120 (0.612178) 1.732504 / 1.541195 (0.191309) 1.767956 / 1.468490 (0.299466) 0.675861 / 4.584777 (-3.908916) 5.791304 / 3.745712 (2.045592) 2.777824 / 5.269862 (-2.492038) 1.353275 / 4.565676 (-3.212401) 0.079526 / 0.424275 (-0.344749) 0.012169 / 0.007607 (0.004562) 0.723024 / 0.226044 (0.496980) 7.219230 / 2.268929 (4.950301) 2.770489 / 55.444624 (-52.674135) 2.141005 / 6.876477 (-4.735472) 2.147404 / 2.142072 (0.005331) 0.855451 / 4.805227 (-3.949777) 0.161310 / 6.500664 (-6.339355) 0.063691 / 0.075469 (-0.011778)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.952078 / 1.841788 (0.110290) 14.325892 / 8.074308 (6.251584) 36.877354 / 10.191392 (26.685961) 1.019885 / 0.680424 (0.339461) 0.538323 / 0.534201 (0.004122) 0.514168 / 0.579283 (-0.065115) 0.610137 / 0.434364 (0.175773) 0.336556 / 0.540337 (-0.203781) 0.367645 / 1.386936 (-1.019291)

CML watermark

Please sign in to comment.