Skip to content

Commit

Permalink
Load examples from JSONLines Dataset without duplication.
Browse files Browse the repository at this point in the history
  • Loading branch information
mitsuse committed Nov 4, 2024
1 parent fa64411 commit 2d3ef0b
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 3 deletions.
2 changes: 1 addition & 1 deletion src/torch_wae/cli/convert_dataset_to_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def main(
)

output.mkdir(parents=True, exist_ok=True)
-    pattern = str(output / "%06d.tar")
+    pattern = str(output / "%04d.tar")
max_size = size_shard * 1024**2

with tqdm(total=n) as progress:
Expand Down
12 changes: 10 additions & 2 deletions src/torch_wae/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,17 @@ def __init__(self, path: Path) -> None:
self.__path = path

     def __iter__(self) -> Iterator[Any]:
+        worker_info = data.get_worker_info()
+
         with self.__path.open() as f:
-            for line in f:
-                yield json.loads(line)
+            for i, line in enumerate(f):
+                if worker_info is None:
+                    yield json.loads(line)
+                else:
+                    worker_id = worker_info.id
+                    num_workers = worker_info.num_workers
+                    if i % num_workers == worker_id:
+                        yield json.loads(line)


@dataclass(frozen=True)
Expand Down

0 comments on commit 2d3ef0b

Please sign in to comment.