"{'vocab_size': 50257, 'n_layer': 12, 'n_head': 12, 'n_embd': 768, 'n_kv_heads': 4, 'block_size': 4096, 'dropout': 0.1, 'bias': False, 'batch_size': 1, 'gradient_accumulation_steps': 64, 'max_iters': 5000, 'learning_rate': 5e-05, 'weight_decay': 0.01, 'beta1': 0.9, 'beta2': 0.99, 'grad_clip': 1.0, 'warmup_iters': 100, 'min_lr': 5e-06, 'eval_interval': 200, 'eval_iters': 20, 'log_interval': 10, 'base_repo': 'Arko007/tiny-edu-50m', 'base_checkpoint': 'checkpoint_790500.pt', 'target_repo': 'Arko007/tiny-edu-50m-instruct', 'device': 'cuda', 'dtype': 'bfloat16', 'compile': False, 'use_gradient_checkpointing': True}"