You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Great work! When training, I ran into a problem when num_workers > 0: "ZeroDivisionError: integer division or modulo by zero". Training log:
[ INFO : 2024-12-10 14:03:57,024 ] - <========== Training process ==========>
[ INFO : 2024-12-10 14:03:57,025 ] - +----------+----------+----------+----------+----------+
[ INFO : 2024-12-10 14:03:57,025 ] - | Train/Val| Epoch| iter| Loss| LR|
[ INFO : 2024-12-10 14:03:57,025 ] - +----------+----------+----------+----------+----------+
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:08:20,135 ] - | TRAIN| 1| 100| 0.72532|0.00042771|
[ INFO : 2024-12-10 14:08:20,136 ] - | TRAIN| 1| 100| 0.72447|0.00042771|
[ INFO : 2024-12-10 14:08:20,137 ] - | TRAIN| 1| 100| 0.70751|0.00042771|
[ INFO : 2024-12-10 14:08:20,141 ] - | TRAIN| 1| 100| 0.7117|0.00042771|
[ INFO : 2024-12-10 14:12:31,773 ] - | TRAIN| 1| 200| 0.23728|0.00018138|
[ INFO : 2024-12-10 14:12:31,774 ] - | TRAIN| 1| 200| 0.15428|0.00018138|
[ INFO : 2024-12-10 14:12:31,783 ] - | TRAIN| 1| 200| 0.20809|0.00018138|
[ INFO : 2024-12-10 14:12:31,837 ] - | TRAIN| 1| 200| 0.19|0.00018138|
[ INFO : 2024-12-10 14:16:05,250 ] - | TRAIN| 1| 300| -0.08594|7.6915e-05|
[ INFO : 2024-12-10 14:16:05,253 ] - | TRAIN| 1| 300| -0.13198|7.6915e-05|
[ INFO : 2024-12-10 14:16:05,253 ] - | TRAIN| 1| 300| -0.10877|7.6915e-05|
[ INFO : 2024-12-10 14:16:05,256 ] - | TRAIN| 1| 300| -0.15461|7.6915e-05|
[ INFO : 2024-12-10 14:19:49,578 ] - | TRAIN| 1| 400| -0.31298|3.2617e-05|
[ INFO : 2024-12-10 14:19:49,579 ] - | TRAIN| 1| 400| -0.27876|3.2617e-05|
[ INFO : 2024-12-10 14:19:49,579 ] - | TRAIN| 1| 400| -0.30184|3.2617e-05|
[ INFO : 2024-12-10 14:19:49,580 ] - | TRAIN| 1| 400| -0.33785|3.2617e-05|
Traceback (most recent call last):
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/bin/train.py", line 400, in <module>
fire.Fire(train)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/bin/train.py", line 329, in train
val_loss, _ = executor.cv(
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/utils/executor.py", line 199, in cv
for i, batch in enumerate(dataloader):
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/_utils.py", line 461, in reraise
raise exception
ZeroDivisionError: Caught ZeroDivisionError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
data.append(next(self.dataset_iter))
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 464, in sample_fix_spk_enrollment
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 622, in random_chunk
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 376, in resample
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 73, in tar_file_and_group
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 42, in url_opener
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/dataset.py", line 140, in __iter__
index = indexes[counter % indexes_len]
ZeroDivisionError: integer division or modulo by zero
The text was updated successfully, but these errors were encountered:
Great work! When training, I ran into a problem when num_workers > 0: "ZeroDivisionError: integer division or modulo by zero". Training log:
[ INFO : 2024-12-10 14:03:57,024 ] - <========== Training process ==========>
[ INFO : 2024-12-10 14:03:57,025 ] - +----------+----------+----------+----------+----------+
[ INFO : 2024-12-10 14:03:57,025 ] - | Train/Val| Epoch| iter| Loss| LR|
[ INFO : 2024-12-10 14:03:57,025 ] - +----------+----------+----------+----------+----------+
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:04:04,552 ] - Reducer buckets have been rebuilt in this iteration.
[ INFO : 2024-12-10 14:08:20,135 ] - | TRAIN| 1| 100| 0.72532|0.00042771|
[ INFO : 2024-12-10 14:08:20,136 ] - | TRAIN| 1| 100| 0.72447|0.00042771|
[ INFO : 2024-12-10 14:08:20,137 ] - | TRAIN| 1| 100| 0.70751|0.00042771|
[ INFO : 2024-12-10 14:08:20,141 ] - | TRAIN| 1| 100| 0.7117|0.00042771|
[ INFO : 2024-12-10 14:12:31,773 ] - | TRAIN| 1| 200| 0.23728|0.00018138|
[ INFO : 2024-12-10 14:12:31,774 ] - | TRAIN| 1| 200| 0.15428|0.00018138|
[ INFO : 2024-12-10 14:12:31,783 ] - | TRAIN| 1| 200| 0.20809|0.00018138|
[ INFO : 2024-12-10 14:12:31,837 ] - | TRAIN| 1| 200| 0.19|0.00018138|
[ INFO : 2024-12-10 14:16:05,250 ] - | TRAIN| 1| 300| -0.08594|7.6915e-05|
[ INFO : 2024-12-10 14:16:05,253 ] - | TRAIN| 1| 300| -0.13198|7.6915e-05|
[ INFO : 2024-12-10 14:16:05,253 ] - | TRAIN| 1| 300| -0.10877|7.6915e-05|
[ INFO : 2024-12-10 14:16:05,256 ] - | TRAIN| 1| 300| -0.15461|7.6915e-05|
[ INFO : 2024-12-10 14:19:49,578 ] - | TRAIN| 1| 400| -0.31298|3.2617e-05|
[ INFO : 2024-12-10 14:19:49,579 ] - | TRAIN| 1| 400| -0.27876|3.2617e-05|
[ INFO : 2024-12-10 14:19:49,579 ] - | TRAIN| 1| 400| -0.30184|3.2617e-05|
[ INFO : 2024-12-10 14:19:49,580 ] - | TRAIN| 1| 400| -0.33785|3.2617e-05|
Traceback (most recent call last):
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/bin/train.py", line 400, in <module>
fire.Fire(train)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/fire/core.py", line 466, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/bin/train.py", line 329, in train
val_loss, _ = executor.cv(
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/utils/executor.py", line 199, in cv
for i, batch in enumerate(dataloader):
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 681, in __next__
data = self._next_data()
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1376, in _next_data
return self._process_data(data)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1402, in _process_data
data.reraise()
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/_utils.py", line 461, in reraise
raise exception
ZeroDivisionError: Caught ZeroDivisionError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/_utils/worker.py", line 302, in _worker_loop
data = fetcher.fetch(index)
File "/nfs/volume-225-13/lijunbo/.env/miniconda3/envs/wesep/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py", line 32, in fetch
data.append(next(self.dataset_iter))
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 464, in sample_fix_spk_enrollment
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 622, in random_chunk
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 376, in resample
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 73, in tar_file_and_group
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/processor.py", line 42, in url_opener
for sample in data:
File "/nfs/volume-1593-1/wushu/wesep/examples/librimix/tse/v2/wesep/dataset/dataset.py", line 140, in __iter__
index = indexes[counter % indexes_len]
ZeroDivisionError: integer division or modulo by zero
The text was updated successfully, but these errors were encountered: