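# Hydra entry point for training. With zero or one visible CUDA device the run
# executes in the current process; with several devices, one DDP worker process
# is spawned per GPU via torch.multiprocessing.
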
import os
from pathlib import Path
from typing import List, Union

import hydra
from omegaconf import DictConfig, OmegaConf
import torch
from torch.distributed import init_process_group, destroy_process_group
import torch.multiprocessing as mp

from trainer import Trainer
from utils import skip_if_run_is_over

# Allow ${eval:...} expressions to be resolved inside the OmegaConf/Hydra configs.
OmegaConf.register_new_resolver("eval", eval)


@hydra.main(config_path="../config", config_name="trainer", version_base="1.3")
def main(cfg: DictConfig) -> None:
    setup_visible_cuda_devices(cfg.common.devices)
    world_size = torch.cuda.device_count()
    root_dir = Path(hydra.utils.get_original_cwd())
    if world_size < 2:
        # Zero or one visible GPU: train directly in this process.
        run(cfg, root_dir)
    else:
        # Several visible GPUs: spawn one DDP worker per device.
        mp.spawn(main_ddp, args=(world_size, cfg, root_dir), nprocs=world_size)


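# Entry point of each spawned DDP worker; mp.spawn supplies the process index as `rank`.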
def main_ddp(rank: int, world_size: int, cfg: DictConfig, root_dir: Path) -> None:
    setup_ddp(rank, world_size)
    run(cfg, root_dir)
    destroy_process_group()


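# Shared by the single-process and DDP paths. The skip_if_run_is_over decorator
# (from utils) presumably short-circuits a run that has already completed.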
@skip_if_run_is_over
def run(cfg: DictConfig, root_dir: Path) -> None:
    trainer = Trainer(cfg, root_dir)
    trainer.run()


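# Single-node DDP rendezvous: every rank connects to localhost on a fixed port
# and joins an NCCL process group.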
def setup_ddp(rank: int, world_size: int) -> None:
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "6006"
init_process_group(backend="nccl", rank=rank, world_size=world_size)
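# Restricts the GPUs visible to PyTorch; called in main() before torch.cuda.device_count().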
def setup_visible_cuda_devices(devices: Union[str, int, List[int]]) -> None:
    if isinstance(devices, str):
        if devices == "cpu":
            devices = []  # no visible devices: CUDA_VISIBLE_DEVICES is set to ""
        else:
            assert devices == "all"
            return  # "all": leave CUDA_VISIBLE_DEVICES untouched
    elif isinstance(devices, int):
        devices = [devices]
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, devices))


if __name__ == "__main__":
    main()