-
Notifications
You must be signed in to change notification settings - Fork 0
/
training.job
51 lines (45 loc) · 1.3 KB
/
training.job
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# Slurm batch header for a 3-node GPU training job. Lines that begin with
# the exact token "#SBATCH" are read by sbatch as submission options.
#
# The next line is NOT an active directive: the space after "#" makes it a
# plain comment. NOTE(review): presumably left disabled because it conflicts
# with --exclusive below -- confirm before re-enabling.
# SBATCH --share
#SBATCH --partition=pascalnodes
#SBATCH --exclusive
#
# Name your job to make it easier for you to track
#
#SBATCH --job-name=3nodes
#
# Resource layout: 12 tasks across 3 nodes, with 4 GPUs requested per node.
# No output/error file directives are given in this header.
#
#SBATCH --ntasks=12
#SBATCH --gres=gpu:4
#SBATCH -N3
#
# Tell the scheduler the job needs at most 12 hours of wall-clock time,
# with 6 CPUs per task and 4G of memory per CPU
#
#SBATCH --time=12:00:00
#SBATCH --mem-per-cpu=4G
#SBATCH --cpus-per-task=6
#
# Request an email notification if the job fails
# NOTE(review): the directive after --mail-type has no option name; it was
# presumably "--mail-user=<address>" before the address got mangled.
# Confirm and restore the option, otherwise sbatch cannot interpret it.
#
#SBATCH --mail-type=FAIL
#SBATCH [email protected]
# Load the software stack used by the training run, then activate the conda
# environment that provides the Python interpreter and its packages.
module load Anaconda3/5.3.0
module load cuda92/toolkit
module load NCCL/2.2.13-CUDA-9.2.148.1
module load OpenMPI/3.1.2-gcccuda-2018b
source activate distributedLearning

# Training entry point.
train_script=/data/user/wsmonroe/MetalsGroup/pennyProject/code/horovodUnet/pennyTrain.py

# Options passed to the training script, one array element per option.
# An array is used instead of backslash continuations because bash removes a
# backslash-newline pair outright: with no whitespace before the backslash
# (or at the start of the next line) consecutive options are glued into a
# single word, e.g. "pennyTrain.py--seed=80--dataset_dir=...".
train_args=(
  --seed=80
  --dataset_dir='/data/user/wsmonroe/MetalsGroup/pennyProject/pennyData/TrainingSet/preprocessed'
  --network='resnet50'
  --preprocessing_function='tf'
  --learning_rate=0.001
  --loss_function='bce_dice'
  --train_data_dir_name='data'
  --val_data_dir_name='validation/data'
  --val_mask_dir='validation/labels'
  --train_mask_dir='labels'
  --input_height=256
  --input_width=256
  --epochs=500
  --batch_size=6
)

# Start one MPI process per Slurm task (SLURM_NTASKS is exported by Slurm
# inside the job) and report the total wall-clock time of the run.
# The mpirun flags are unchanged from the original command line: no process
# binding, slot-based mapping, the ob1 PML, and TCP traffic limited to ib0.
time mpirun -np "$SLURM_NTASKS" -bind-to none -map-by slot \
  -mca pml ob1 -mca btl_tcp_if_include ib0 \
  python "$train_script" "${train_args[@]}"