/external/pytorch/test/distributed/fsdp/ |
D | test_fsdp_use_orig_params.py | 109 ddp_model = DDP( 114 return ddp_model 166 ddp_model: DDP, 177 for model, optim in ((ddp_model, ddp_optim), (fsdp_model, fsdp_optim)): 190 if model is ddp_model and fsdp_model.cpu_offload.offload_params: 193 if model is ddp_model and fsdp_model.cpu_offload.offload_params: 197 self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model) 199 def _check_ddp_fsdp_param_parity(self, ddp_model: DDP, fsdp_model: FSDP): 202 ddp_model.module.named_parameters(), fsdp_model.named_parameters() 358 ddp_model = self._get_ddp_transformer(find_unused_params=False) [all …]
|
D | test_fsdp_clip_grad_norm.py | 108 ddp_model = DDP(local_model, device_ids=[self.rank]) 157 ddp_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR) 161 inp = ddp_model.module.get_input(device) 162 for model in (ddp_model, fsdp_model): 172 for param in itertools.chain(ddp_model.parameters(), fsdp_model.parameters()): 178 param.grad.detach().clone() for param in ddp_model.parameters() 186 ddp_model.parameters(), 196 for param, orig_grad in zip(ddp_model.parameters(), orig_ddp_grads): 209 ddp_model.module.named_parameters(), 228 inp = ddp_model.module.get_input(device) [all …]
|
D | test_fsdp_unshard_params.py | 491 ddp_model: DDP, 509 ddp_model.module.named_parameters(), 534 for param in ddp_model.parameters(): 560 ddp_model = DDP(model, device_ids=[self.rank]) 572 for p1, p2 in zip(ddp_model.module.parameters(), fsdp_model.parameters()): 577 ddp_out = ddp_model(*inp) 583 _check_grads(ddp_model, fsdp_model, old_fsdp_grads) 587 ddp_out = ddp_model(*inp) 591 _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
|
D | test_fsdp_misc.py | 191 ddp_model = torch.nn.parallel.DistributedDataParallel( 196 ddp_opt = torch.optim.SGD(ddp_model.parameters(), lr=1e-4) 207 for model, opt in ((fsdp_model, fsdp_opt), (ddp_model, ddp_opt)): 224 ddp_model.eval() 229 ddp_loss = ddp_model(x, y) 233 ddp_model.train() 237 for model, opt in ((fsdp_model, fsdp_opt), (ddp_model, ddp_opt)):
|
/external/pytorch/benchmarks/distributed/rpc/parameter_server/trainer/ |
D | ddp_models.py | 18 ddp_model = DDP(model, device_ids=[rank], process_group=process_group) 20 ddp_model.register_comm_hook(hook_state, hook) 21 return ddp_model, hook_state
|
D | iteration_steps.py | 2 self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch argument 19 loss = criterion(ddp_model(batch[0]), batch[1])
|
D | trainer.py | 227 ddp_model, hook_state = self.create_ddp_model( 230 optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4) 238 ddp_model,
|
/external/pytorch/test/distributed/optim/ |
D | test_zero_redundancy_optimizer.py | 886 ddp_model = DDP( 896 ddp_model, 905 ddp_loss = ddp_model(input_tensor).abs().sum() 930 ddp_model, 940 next(ddp_model.parameters()).requires_grad = bool(i % 2) 997 ddp_model = DDP(model, device_ids=[rank]) if is_gpu else DDP(model) 998 local_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR) 1016 with ddp_model.join(): 1019 output = ddp_model(input) 1023 for p in ddp_model.parameters(): [all …]
|
/external/pytorch/test/distributed/ |
D | test_c10d_common.py | 343 ddp_model = DistributedDataParallel( 356 return model, ddp_model, input, target 375 ddp_model = DistributedDataParallel( 386 return model, ddp_model, input, target 420 ddp_model = copy.deepcopy(input_model).cuda() 421 ddp_model = nn.parallel.DistributedDataParallel( 422 ddp_model, 431 ddp_model._get_ddp_logging_data().get("static_graph", 0), static_graph 438 ddp_model.zero_grad(set_to_none=False) 448 ddp_model, [all …]
|
D | test_c10d_ucc.py | 639 def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): argument 652 criterion(ddp_model(partial_input), partial_target).backward() 656 ddp_parameter = next(ddp_model.parameters()) 774 ddp_model = DistributedDataParallel( 780 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) 957 ddp_model = DistributedDataParallel( 973 ddp_model.register_comm_hook(None, allreduce_hook_ucc) 975 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
|
D | test_c10d_nccl.py | 987 ddp_model = DistributedDataParallel( 994 ddp_model = DistributedDataParallel( 1002 ddp_model = DistributedDataParallel(model, process_group=process_group) 1008 ddp_model = DistributedDataParallel( 1018 ddp_model = DistributedDataParallel( 1032 ddp_model.train() 1033 output = ddp_model(input) 1037 self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters())) 1198 ddp_model = None 1218 nonlocal ddp_model [all …]
|
D | test_c10d_gloo.py | 1814 def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model): argument 1827 criterion(ddp_model(partial_input), partial_target).backward() 1831 ddp_parameter = next(ddp_model.parameters()) 1949 ddp_model = DistributedDataParallel( 1955 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model) 2127 ddp_model = DistributedDataParallel( 2143 ddp_model.register_comm_hook(None, allreduce_hook_gloo) 2145 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
|
D | test_dynamo_distributed.py | 788 ddp_model = DDP(model, device_ids=[self.rank]) 791 optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 804 output = ddp_model(data)
|
/external/pytorch/docs/source/ |
D | ddp_comm_hooks.rst | 170 ddp_model = DistributedDataParallel(model, device_ids=[rank]) 175 optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 176 ddp_model.register_comm_hook(powersgd_state, powersgd_hook) 179 'state_dict': ddp_model.state_dict(),
|
D | distributed.rst | 690 ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank]) 696 output = ddp_model(inp)
|
/external/pytorch/test/distributed/checkpoint/ |
D | test_state_dict.py | 555 ddp_model = DDP(copy.deepcopy(model)) 556 set_model_state_dict(ddp_model, get_model_state_dict(ddp_model)) 558 self.assertEqual(model.state_dict(), get_model_state_dict(ddp_model)) 567 ddp_model = DDP(copy.deepcopy(model)) 568 set_model_state_dict(ddp_model, get_model_state_dict(ddp_model)) 569 self.assertEqual(model.state_dict(), get_model_state_dict(ddp_model))
|
/external/pytorch/docs/source/notes/ |
D | ddp.rst | 41 ddp_model = DDP(model, device_ids=[rank]) 44 optimizer = optim.SGD(ddp_model.parameters(), lr=0.001) 47 outputs = ddp_model(torch.randn(20, 10).to(rank)) 76 ddp_model = DDP(model, device_ids=[rank]) 77 ddp_model = torch.compile(ddp_model)
|
/external/pytorch/torch/testing/_internal/distributed/ |
D | ddp_under_dist_autograd_test.py | 595 ddp_model = DistributedDataParallel(model) 602 loss = ddp_model(input, offsets).sum() 606 loss = ddp_model(input, offsets).sum() 640 ddp_model = DistributedDataParallel(layer2) 641 loss = ddp_model(layer1(inputs)).sum() 646 loss = ddp_model(remote_layer1(inputs)).sum()
|
D | distributed_test.py | 4423 ddp_model = nn.parallel.DistributedDataParallel( 4438 ddp_model = torch.nn.parallel.DistributedDataParallel(model) 4443 ddp_model().backward(create_graph=True) 4446 all(param.requires_grad for param in ddp_model.parameters()) 4508 ddp_model = torch.nn.parallel.DistributedDataParallel( 4512 ddp_logging_data = ddp_model._get_ddp_logging_data() 4515 ddp_model.register_comm_hook(None, hook) 4516 ddp_logging_data = ddp_model._get_ddp_logging_data() 4520 ddp_model = torch.nn.parallel.DistributedDataParallel( 4524 ddp_logging_data = ddp_model._get_ddp_logging_data() [all …]
|
/external/pytorch/test/dynamo/ |
D | test_structured_trace.py | 365 ddp_model = torch._dynamo.optimize("inductor")( 369 ddp_model(torch.randn(1024, 1024, device="cuda:0"))
|
D | test_logging.py | 230 ddp_model = torch._dynamo.optimize("inductor")( 234 ddp_model(torch.randn(1024, 1024, device="cuda:0"))
|
/external/pytorch/torch/testing/_internal/ |
D | common_fsdp.py | 583 ddp_model = NonUniformReqGradNWM( 589 NonUniformReqGradNWM._set_nonuniform_req_grad(ddp_model, req_grad_pattern) 590 return ddp_model
|