train.py
# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, check out LICENSE.md
import argparse
import os
import sys
import random

import torch.autograd.profiler as profiler
import wandb

import imaginaire.config
from imaginaire.config import Config
from imaginaire.utils.cudnn import init_cudnn
from imaginaire.utils.dataset import get_train_and_val_dataloader
from imaginaire.utils.distributed import init_dist, is_master, get_world_size
from imaginaire.utils.distributed import master_only_print as print
from imaginaire.utils.gpu_affinity import set_affinity
from imaginaire.utils.misc import slice_tensor
from imaginaire.utils.logging import init_logging, make_logging_dir
from imaginaire.utils.trainer import (get_model_optimizer_and_scheduler,
                                      get_trainer, set_random_seed)

sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))


def parse_args():
    parser = argparse.ArgumentParser(description='Training')
    parser.add_argument('--config',
                        help='Path to the training config file.', required=True)
    parser.add_argument('--logdir', help='Dir for saving logs and models.')
    parser.add_argument('--checkpoint', default='', help='Checkpoint path.')
    parser.add_argument('--seed', type=int, default=2, help='Random seed.')
    parser.add_argument('--randomized_seed', action='store_true',
                        help='Use a random seed between 0-10000.')
    parser.add_argument('--local_rank', type=int, default=os.getenv('LOCAL_RANK', 0))
    parser.add_argument('--single_gpu', action='store_true')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--use_jit', action='store_true')
    parser.add_argument('--profile', action='store_true')
    parser.add_argument('--wandb', action='store_true')
    parser.add_argument('--wandb_name', default='default', type=str)
    parser.add_argument('--wandb_id', type=str)
    parser.add_argument('--resume', type=int)
    parser.add_argument('--num_workers', type=int)
    args = parser.parse_args()
    return args


def main():
    args = parse_args()
    set_affinity(args.local_rank)
    if args.randomized_seed:
        args.seed = random.randint(0, 10000)
    set_random_seed(args.seed, by_rank=True)
    cfg = Config(args.config)
    try:
        from userlib.auto_resume import AutoResume
        AutoResume.init()
    except:  # noqa
        pass

    # If args.single_gpu is set to True,
    # we will disable distributed data parallel.
    if not args.single_gpu:
        cfg.local_rank = args.local_rank
        init_dist(cfg.local_rank)
    print(f"Training with {get_world_size()} GPUs.")

    # Global arguments.
    imaginaire.config.DEBUG = args.debug
    imaginaire.config.USE_JIT = args.use_jit

    # Override the number of data loading workers if necessary
    if args.num_workers is not None:
        cfg.data.num_workers = args.num_workers

    # Create log directory for storing training results.
    cfg.date_uid, cfg.logdir = init_logging(args.config, args.logdir)
    make_logging_dir(cfg.logdir)

    # Initialize cudnn.
    init_cudnn(cfg.cudnn.deterministic, cfg.cudnn.benchmark)

    # Initialize data loaders and models.
    batch_size = cfg.data.train.batch_size
    total_step = max(cfg.trainer.dis_step, cfg.trainer.gen_step)
    cfg.data.train.batch_size *= total_step
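    # The loader batch size is scaled by the larger of dis_step and gen_step so
    # that every fetched batch can be sliced in the training loop below into
    # per-update sub-batches of the original `batch_size` via slice_tensor.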
    train_data_loader, val_data_loader = get_train_and_val_dataloader(cfg, args.seed)
    net_G, net_D, opt_G, opt_D, sch_G, sch_D = \
        get_model_optimizer_and_scheduler(cfg, seed=args.seed)
    trainer = get_trainer(cfg, net_G, net_D,
                          opt_G, opt_D,
                          sch_G, sch_D,
                          train_data_loader, val_data_loader)
    resumed, current_epoch, current_iteration = trainer.load_checkpoint(
        cfg, args.checkpoint, args.resume)

    # Initialize Wandb.
    if is_master():
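        # Prefer an explicit --wandb_id if one was given; otherwise reuse the ID
        # persisted in logdir when resuming, or generate a fresh one and save it,
        # so that a resumed job keeps logging to the same W&B run.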
        if args.wandb_id is not None:
            wandb_id = args.wandb_id
        else:
            if resumed and os.path.exists(os.path.join(cfg.logdir, 'wandb_id.txt')):
                with open(os.path.join(cfg.logdir, 'wandb_id.txt'), 'r+') as f:
                    wandb_id = f.read()
            else:
                wandb_id = wandb.util.generate_id()
                with open(os.path.join(cfg.logdir, 'wandb_id.txt'), 'w+') as f:
                    f.write(wandb_id)
        wandb_mode = "disabled" if (args.debug or not args.wandb) else "online"
        wandb.init(id=wandb_id,
                   project=args.wandb_name,
                   config=cfg,
                   name=os.path.basename(cfg.logdir),
                   resume="allow",
                   settings=wandb.Settings(start_method="fork"),
                   mode=wandb_mode)
        wandb.config.update({'dataset': cfg.data.name})
        wandb.watch(trainer.net_G_module)
        wandb.watch(trainer.net_D.module)
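        # wandb.watch registers hooks on both networks so gradient statistics
        # are logged to the run (wandb's default when watch() is called).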

    # Start training.
    for epoch in range(current_epoch, cfg.max_epoch):
        print('Epoch {} ...'.format(epoch))
        if not args.single_gpu:
            train_data_loader.sampler.set_epoch(current_epoch)
        trainer.start_of_epoch(current_epoch)
        for it, data in enumerate(train_data_loader):
            with profiler.profile(enabled=args.profile,
                                  use_cuda=True,
                                  profile_memory=True,
                                  record_shapes=True) as prof:
                data = trainer.start_of_iteration(data, current_iteration)
                for i in range(cfg.trainer.dis_step):
                    trainer.dis_update(
                        slice_tensor(data, i * batch_size,
                                     (i + 1) * batch_size))
                for i in range(cfg.trainer.gen_step):
                    trainer.gen_update(
                        slice_tensor(data, i * batch_size,
                                     (i + 1) * batch_size))
                current_iteration += 1
                trainer.end_of_iteration(data, current_epoch, current_iteration)
                if current_iteration >= cfg.max_iter:
                    print('Done with training!!!')
                    return
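            # With --profile, report a per-op table sorted by total CUDA time and
            # dump a Chrome-format trace into logdir (viewable in chrome://tracing).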
            if args.profile:
                print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
                prof.export_chrome_trace(os.path.join(cfg.logdir, "trace.json"))
            try:
                if AutoResume.termination_requested():
                    trainer.save_checkpoint(current_epoch, current_iteration)
                    AutoResume.request_resume()
                    print("Training terminated. Returning")
                    return 0
            except:  # noqa
                pass
        current_epoch += 1
        trainer.end_of_epoch(data, current_epoch, current_iteration)
    print('Done with training!!!')
    return


if __name__ == "__main__":
    main()
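

# Example invocations (illustrative only; the config path and launcher below are
# not part of this file):
#   Single-GPU run:
#     python train.py --config <path/to/config.yaml> --logdir logs/my_run --single_gpu
#   Multi-GPU run with a launcher that sets LOCAL_RANK (e.g. torchrun):
#     torchrun --nproc_per_node=8 train.py --config <path/to/config.yaml> --logdir logs/my_run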