# Learning-rate guide by effective batch size:
#   5.0e-05 for batch 32
#   1.0e-04 for batch 64
#   2.0e-04 for batch 128 or larger
learning_rate: 1.0e-04
beta1: 0.9
beta2: 0.999
weight_decay: 0.01
adam_epsilon: 1.0e-08
grad_clip: 1.0
batch_size: 16
accumulation_steps: 1
w_weak: 0.0
lr_scheduler:
  warmup_steps: 500
  decay_steps: 100000
  end_factor: 1.0e-02
# snr_gamma: 5.0 is used in Tango, not Stable; may try it in a later experiment.
# Effective (real) batch size = n_gpu * batch_size * accumulation_steps.
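# Worked example of the effective-batch arithmetic above (illustrative sketch;
# the linear learning-rate scaling is an assumption read off the guide at the top):
#   effective_batch = n_gpu * batch_size * accumulation_steps
#   e.g. 4 GPUs * batch_size 16 * accumulation_steps 2 = 128 -> learning_rate: 2.0e-04
#   e.g. 1 GPU  * batch_size 16 * accumulation_steps 1 = 16  -> ~2.5e-05 by the same rule
# If the scheduler decays linearly toward end_factor (an assumption about its shape),
# the final learning rate would be end_factor * learning_rate = 1.0e-02 * 1.0e-04 = 1.0e-06.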