
Searched refs:ddp_model (Results 1 – 22 of 22) sorted by relevance

/external/pytorch/test/distributed/fsdp/
test_fsdp_use_orig_params.py
109 ddp_model = DDP(
114 return ddp_model
166 ddp_model: DDP,
177 for model, optim in ((ddp_model, ddp_optim), (fsdp_model, fsdp_optim)):
190 if model is ddp_model and fsdp_model.cpu_offload.offload_params:
193 if model is ddp_model and fsdp_model.cpu_offload.offload_params:
197 self._check_ddp_fsdp_param_parity(ddp_model, fsdp_model)
199 def _check_ddp_fsdp_param_parity(self, ddp_model: DDP, fsdp_model: FSDP):
202 ddp_model.module.named_parameters(), fsdp_model.named_parameters()
358 ddp_model = self._get_ddp_transformer(find_unused_params=False)
[all …]
test_fsdp_clip_grad_norm.py
108 ddp_model = DDP(local_model, device_ids=[self.rank])
157 ddp_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR)
161 inp = ddp_model.module.get_input(device)
162 for model in (ddp_model, fsdp_model):
172 for param in itertools.chain(ddp_model.parameters(), fsdp_model.parameters()):
178 param.grad.detach().clone() for param in ddp_model.parameters()
186 ddp_model.parameters(),
196 for param, orig_grad in zip(ddp_model.parameters(), orig_ddp_grads):
209 ddp_model.module.named_parameters(),
228 inp = ddp_model.module.get_input(device)
[all …]
test_fsdp_unshard_params.py
491 ddp_model: DDP,
509 ddp_model.module.named_parameters(),
534 for param in ddp_model.parameters():
560 ddp_model = DDP(model, device_ids=[self.rank])
572 for p1, p2 in zip(ddp_model.module.parameters(), fsdp_model.parameters()):
577 ddp_out = ddp_model(*inp)
583 _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
587 ddp_out = ddp_model(*inp)
591 _check_grads(ddp_model, fsdp_model, old_fsdp_grads)
test_fsdp_misc.py
191 ddp_model = torch.nn.parallel.DistributedDataParallel(
196 ddp_opt = torch.optim.SGD(ddp_model.parameters(), lr=1e-4)
207 for model, opt in ((fsdp_model, fsdp_opt), (ddp_model, ddp_opt)):
224 ddp_model.eval()
229 ddp_loss = ddp_model(x, y)
233 ddp_model.train()
237 for model, opt in ((fsdp_model, fsdp_opt), (ddp_model, ddp_opt)):
/external/pytorch/benchmarks/distributed/rpc/parameter_server/trainer/
ddp_models.py
18 ddp_model = DDP(model, device_ids=[rank], process_group=process_group)
20 ddp_model.register_comm_hook(hook_state, hook)
21 return ddp_model, hook_state
iteration_steps.py
2 self, ddp_model, criterion, optimizer, hook_state, epoch, index, batch
19 loss = criterion(ddp_model(batch[0]), batch[1])
trainer.py
227 ddp_model, hook_state = self.create_ddp_model(
230 optimizer = torch.optim.SGD(ddp_model.parameters(), 1e-4)
238 ddp_model,
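
Taken together, the three benchmark files above outline one pattern: build a DDP replica on a process group, optionally attach a communication hook, then drive it from a per-iteration step. A minimal sketch of that pattern (the helper names and hook arguments are placeholders, not the benchmark's actual code):

from torch.nn.parallel import DistributedDataParallel as DDP

def create_ddp_model(model, rank, process_group, hook_state=None, hook=None):
    # Replicate the model on this rank; DDP handles gradient reduction.
    ddp_model = DDP(model, device_ids=[rank], process_group=process_group)
    if hook is not None:
        # A comm hook replaces the default allreduce with a custom reduction.
        ddp_model.register_comm_hook(hook_state, hook)
    return ddp_model, hook_state

def iteration_step(ddp_model, criterion, optimizer, batch):
    # batch[0] is the input tensor, batch[1] the target, as in the benchmark loop.
    optimizer.zero_grad()
    loss = criterion(ddp_model(batch[0]), batch[1])
    loss.backward()
    optimizer.step()
    return loss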
/external/pytorch/test/distributed/optim/
test_zero_redundancy_optimizer.py
886 ddp_model = DDP(
896 ddp_model,
905 ddp_loss = ddp_model(input_tensor).abs().sum()
930 ddp_model,
940 next(ddp_model.parameters()).requires_grad = bool(i % 2)
997 ddp_model = DDP(model, device_ids=[rank]) if is_gpu else DDP(model)
998 local_optim = torch.optim.Adam(ddp_model.parameters(), lr=LR)
1016 with ddp_model.join():
1019 output = ddp_model(input)
1023 for p in ddp_model.parameters():
[all …]
/external/pytorch/test/distributed/
test_c10d_common.py
343 ddp_model = DistributedDataParallel(
356 return model, ddp_model, input, target
375 ddp_model = DistributedDataParallel(
386 return model, ddp_model, input, target
420 ddp_model = copy.deepcopy(input_model).cuda()
421 ddp_model = nn.parallel.DistributedDataParallel(
422 ddp_model,
431 ddp_model._get_ddp_logging_data().get("static_graph", 0), static_graph
438 ddp_model.zero_grad(set_to_none=False)
448 ddp_model,
[all …]
test_c10d_ucc.py
639 def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model):
652 criterion(ddp_model(partial_input), partial_target).backward()
656 ddp_parameter = next(ddp_model.parameters())
774 ddp_model = DistributedDataParallel(
780 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
957 ddp_model = DistributedDataParallel(
973 ddp_model.register_comm_hook(None, allreduce_hook_ucc)
975 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
test_c10d_nccl.py
987 ddp_model = DistributedDataParallel(
994 ddp_model = DistributedDataParallel(
1002 ddp_model = DistributedDataParallel(model, process_group=process_group)
1008 ddp_model = DistributedDataParallel(
1018 ddp_model = DistributedDataParallel(
1032 ddp_model.train()
1033 output = ddp_model(input)
1037 self.assertFalse(any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
1198 ddp_model = None
1218 nonlocal ddp_model
[all …]
test_c10d_gloo.py
1814 def _run_and_verify_sparse_gradients(self, vanilla_model, ddp_model):
1827 criterion(ddp_model(partial_input), partial_target).backward()
1831 ddp_parameter = next(ddp_model.parameters())
1949 ddp_model = DistributedDataParallel(
1955 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
2127 ddp_model = DistributedDataParallel(
2143 ddp_model.register_comm_hook(None, allreduce_hook_gloo)
2145 self._run_and_verify_sparse_gradients(vanilla_model, ddp_model)
test_dynamo_distributed.py
788 ddp_model = DDP(model, device_ids=[self.rank])
791 optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
804 output = ddp_model(data)
/external/pytorch/docs/source/
ddp_comm_hooks.rst
170 ddp_model = DistributedDataParallel(model, device_ids=[rank])
175 optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
176 ddp_model.register_comm_hook(powersgd_state, powersgd_hook)
179 'state_dict': ddp_model.state_dict(),
distributed.rst
690 ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
696 output = ddp_model(inp)
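
The ddp_comm_hooks.rst matches show a DDP model having a PowerSGD hook registered and its state_dict checkpointed. A rough sketch of that usage, assuming the built-in PowerSGD hook from torch.distributed.algorithms.ddp_comm_hooks (the rank and warm-up values are illustrative):

import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel
from torch.distributed.algorithms.ddp_comm_hooks import powerSGD_hook as powerSGD

def wrap_with_powersgd(model, rank):
    ddp_model = DistributedDataParallel(model, device_ids=[rank])
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
    powersgd_state = powerSGD.PowerSGDState(
        process_group=None,           # default process group
        matrix_approximation_rank=1,  # low-rank approximation of each gradient
        start_powerSGD_iter=10,       # warm up with vanilla allreduce first
    )
    ddp_model.register_comm_hook(powersgd_state, powerSGD.powerSGD_hook)
    checkpoint = {
        "state_dict": ddp_model.state_dict(),
        "powersgd_state": powersgd_state,  # hook state is saved alongside the model
    }
    return ddp_model, optimizer, checkpoint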
/external/pytorch/test/distributed/checkpoint/
test_state_dict.py
555 ddp_model = DDP(copy.deepcopy(model))
556 set_model_state_dict(ddp_model, get_model_state_dict(ddp_model))
558 self.assertEqual(model.state_dict(), get_model_state_dict(ddp_model))
567 ddp_model = DDP(copy.deepcopy(model))
568 set_model_state_dict(ddp_model, get_model_state_dict(ddp_model))
569 self.assertEqual(model.state_dict(), get_model_state_dict(ddp_model))
/external/pytorch/docs/source/notes/
ddp.rst
41 ddp_model = DDP(model, device_ids=[rank])
44 optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
47 outputs = ddp_model(torch.randn(20, 10).to(rank))
76 ddp_model = DDP(model, device_ids=[rank])
77 ddp_model = torch.compile(ddp_model)
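
The ddp.rst matches correspond to the basic DDP training example plus its torch.compile variant. A condensed sketch, assuming the process group is already initialized and using an illustrative toy model in place of the tutorial's ToyModel:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP

def demo_basic(rank):
    # Assumes torch.distributed.init_process_group(...) was already called.
    model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 5)).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    # torch.compile can wrap the DDP module directly, as in the second match above.
    ddp_model = torch.compile(ddp_model)

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10).to(rank))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()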
/external/pytorch/torch/testing/_internal/distributed/
ddp_under_dist_autograd_test.py
595 ddp_model = DistributedDataParallel(model)
602 loss = ddp_model(input, offsets).sum()
606 loss = ddp_model(input, offsets).sum()
640 ddp_model = DistributedDataParallel(layer2)
641 loss = ddp_model(layer1(inputs)).sum()
646 loss = ddp_model(remote_layer1(inputs)).sum()
distributed_test.py
4423 ddp_model = nn.parallel.DistributedDataParallel(
4438 ddp_model = torch.nn.parallel.DistributedDataParallel(model)
4443 ddp_model().backward(create_graph=True)
4446 all(param.requires_grad for param in ddp_model.parameters())
4508 ddp_model = torch.nn.parallel.DistributedDataParallel(
4512 ddp_logging_data = ddp_model._get_ddp_logging_data()
4515 ddp_model.register_comm_hook(None, hook)
4516 ddp_logging_data = ddp_model._get_ddp_logging_data()
4520 ddp_model = torch.nn.parallel.DistributedDataParallel(
4524 ddp_logging_data = ddp_model._get_ddp_logging_data()
[all …]
/external/pytorch/test/dynamo/
test_structured_trace.py
365 ddp_model = torch._dynamo.optimize("inductor")(
369 ddp_model(torch.randn(1024, 1024, device="cuda:0"))
test_logging.py
230 ddp_model = torch._dynamo.optimize("inductor")(
234 ddp_model(torch.randn(1024, 1024, device="cuda:0"))
/external/pytorch/torch/testing/_internal/
common_fsdp.py
583 ddp_model = NonUniformReqGradNWM(
589 NonUniformReqGradNWM._set_nonuniform_req_grad(ddp_model, req_grad_pattern)
590 return ddp_model