/external/pytorch/torch/distributed/ |
D | _functional_collectives.py |
    446  output_split_sizes: Optional[List[int]],
    466  if output_split_sizes is not None:
    468  isinstance(size, (int, torch.SymInt)) for size in output_split_sizes
    469  ), output_split_sizes
    476  if output_split_sizes is None or input_split_sizes is None:
    477  assert output_split_sizes is None and input_split_sizes is None, (
    481  output_split_sizes = [self.shape[0] // group_size] * group_size
    482  input_split_sizes = output_split_sizes
    485  output_split_sizes,
    494  output_split_sizes: Optional[List[int]],
    [all …]
|
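The `_functional_collectives.py` matches at 476–482 above show the fallback when no split sizes are passed: the two lists must be omitted together, and dim 0 is then split evenly across the group. A minimal standalone sketch of that default, with illustrative names, not the PyTorch code itself:

# Toy sketch of the even-split default; names are illustrative, not PyTorch's.
from typing import List, Optional, Tuple


def default_split_sizes(
    dim0: int,
    group_size: int,
    output_split_sizes: Optional[List[int]] = None,
    input_split_sizes: Optional[List[int]] = None,
) -> Tuple[List[int], List[int]]:
    if output_split_sizes is None or input_split_sizes is None:
        # Mirrors the assert above: either both lists are given or neither is.
        assert output_split_sizes is None and input_split_sizes is None
        output_split_sizes = [dim0 // group_size] * group_size
        input_split_sizes = output_split_sizes
    return output_split_sizes, input_split_sizes


print(default_split_sizes(8, 4))  # ([2, 2, 2, 2], [2, 2, 2, 2])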
D | _functional_collectives_impl.py |
    93   output_split_sizes: Optional[List[int]],
    99   if output_split_sizes is None or input_split_sizes is None:
    100  assert output_split_sizes is None and input_split_sizes is None, (
    104  output_split_sizes = [input.shape[0] // group_size] * group_size
    105  input_split_sizes = output_split_sizes
    110  output_split_sizes,
|
D | distributed_c10d.py |
    3883  output_split_sizes=None,  argument
    3992  output_split_sizes = [] if output_split_sizes is None else output_split_sizes
    3997  output, input, output_split_sizes, input_split_sizes, opts
|
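distributed_c10d.py hosts the public torch.distributed.all_to_all_single entry point; the match at 3992 shows that a missing output_split_sizes is normalized to an empty list, i.e. an even split. A hedged usage sketch, assuming a process group is already initialized with world_size == 2; the split values are made up for illustration:

# Hedged usage sketch for torch.distributed.all_to_all_single; assumes an
# initialized process group with world_size == 2.
import torch
import torch.distributed as dist

rank = dist.get_rank()

# Send matrix S[i][j] = number of elements rank i sends to rank j:
#   rank 0 sends [1, 3], rank 1 sends [2, 2].
input_split_sizes = [[1, 3], [2, 2]][rank]   # this rank's row of S
output_split_sizes = [[1, 2], [3, 2]][rank]  # this rank's column of S

inp = torch.arange(sum(input_split_sizes), dtype=torch.float32)
out = torch.empty(sum(output_split_sizes), dtype=torch.float32)

# Passing None for both split-size arguments would instead request an even
# split, matching the `[] if output_split_sizes is None else ...` line above.
dist.all_to_all_single(out, inp, output_split_sizes, input_split_sizes)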
/external/pytorch/torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/ |
D | _common.py |
    190  output_split_sizes = [0] * world_size
    192  output_split_sizes[placement.rank()] = get_chunked_dim_size(
    198  output, combined_results, output_split_sizes=output_split_sizes, group=pg
    211  dim_size = output_split_sizes[placement.rank()]
    214  for i, split_size in enumerate(output_split_sizes)
|
/external/pytorch/torch/distributed/nn/ |
D | functional.py |
    177  output_split_sizes=None,  argument
    201  group, output, output_split_sizes, input_split_sizes, input
    411  def forward(ctx, group, output, output_split_sizes, input_split_sizes, input):  argument
    414  ctx.output_split_sizes = input_split_sizes
    415  ctx.input_split_sizes = output_split_sizes
    419  output_split_sizes=output_split_sizes,
    434  ctx.output_split_sizes,
|
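The torch.distributed.nn.functional matches at 414–415 are the interesting ones: the autograd wrapper stores the two lists swapped, because the backward all-to-all runs in the reverse direction, so each rank sends back exactly what it received in forward. A simplified, hypothetical sketch of that idea (the real Function also threads group and output arguments):

# Hypothetical, simplified sketch of the split-size swap; not the real wrapper.
import torch
import torch.distributed as dist


class _AllToAllSingleSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, output_split_sizes, input_split_sizes, inp):
        # Stored swapped: backward receives what forward sent and sends back
        # what forward received.
        ctx.output_split_sizes = input_split_sizes
        ctx.input_split_sizes = output_split_sizes
        out = inp.new_empty(sum(output_split_sizes))
        dist.all_to_all_single(
            out,
            inp,
            output_split_sizes=output_split_sizes,
            input_split_sizes=input_split_sizes,
        )
        return out

    @staticmethod
    def backward(ctx, grad_output):
        # grad_input must match the forward input, i.e. sum(ctx.output_split_sizes).
        grad_input = grad_output.new_empty(sum(ctx.output_split_sizes))
        dist.all_to_all_single(
            grad_input,
            grad_output.contiguous(),
            output_split_sizes=ctx.output_split_sizes,
            input_split_sizes=ctx.input_split_sizes,
        )
        return None, None, grad_input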
/external/pytorch/torch/csrc/distributed/c10d/ |
D | Functional.cpp |
    259  std::vector<int64_t> output_split_sizes,  in all_to_all_single() argument
    265  output_split_sizes.begin(), output_split_sizes.end(), int64_t(0));  in all_to_all_single()
    273  output_split_sizes,  in all_to_all_single()
    403  std::vector<int64_t> output_split_sizes,  in forward() argument
    410  ctx->saved_data["input_split_sizes"] = output_split_sizes;  in forward()
    416  .call(input, output_split_sizes, input_split_sizes, group_name);  in forward()
    422  const std::vector<int64_t>& output_split_sizes =  in backward() local
    435  .call(grad_out, output_split_sizes, input_split_sizes, group_name);  in backward()
    450  const std::vector<int64_t>& output_split_sizes,  in all_to_all_single_autograd() argument
    454  input, output_split_sizes, input_split_sizes, group_name);  in all_to_all_single_autograd()
|
D | Ops.cpp |
    414  std::vector<int64_t> output_split_sizes, \
    421  output_split_sizes, \
|
/external/pytorch/torch/distributed/_shard/sharded_tensor/ |
D | reshard.py |
    128  output_split_sizes = [0] * world_size
    131  output_split_sizes[new_rank] = sharded_dim_size
    144  output_split_sizes=output_split_sizes,
|
/external/pytorch/test/distributed/ |
D | test_c10d_functional_native.py |
    356  output_split_sizes = send_sz_matrix[:, self.rank].tolist()
    361  output_split_sizes,
    369  for rank, sz in enumerate(output_split_sizes)
    376  input, output_split_sizes, input_split_sizes, "default"
    780  output_split_sizes: torch.Tensor,
    785  _tolist_with_constrain_as_size(output_split_sizes),
    795  output_split_sizes = send_sz_matrix[:, self.rank].contiguous()
    805  compiled, input, output_split_sizes, input_split_sizes
|
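The test above derives both lists from a single send-size matrix: row r is what rank r sends to each peer, so a rank's input splits are its row and its output splits are its column (output_split_sizes = send_sz_matrix[:, self.rank]). A small illustration with assumed values:

# Assumed values; mirrors the test's send-size-matrix pattern.
import torch

world_size, rank = 4, 1
torch.manual_seed(0)  # every rank must derive the same matrix
send_sz_matrix = torch.randint(0, 8, (world_size, world_size))

input_split_sizes = send_sz_matrix[rank].tolist()      # what this rank sends to each peer
output_split_sizes = send_sz_matrix[:, rank].tolist()  # what this rank receives from each peer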
D | test_functional_api.py |
    525  x, output_split_sizes=split_sizes, input_split_sizes=split_sizes, group=mesh
    543  x, output_split_sizes=split_sizes, input_split_sizes=split_sizes, group=mesh
    559  x, output_split_sizes=None, input_split_sizes=None, group=mesh
|
D | test_c10d_spawn.py | 242 y, x, output_split_sizes=split_sizes, input_split_sizes=split_sizes
|
D | test_inductor_collectives.py |
    410  output_split_sizes = _tolist_with_constrain_as_size(
    415  output_split_sizes,
|
/external/pytorch/torch/testing/_internal/distributed/ |
D | multi_threaded_pg.py |
    84   output_buffer, _, output_split_sizes, _ = data[dest_rank]
    86   … output_indexes = self._size_cumsum(output_buffer.size(0), output_split_sizes, world_size)
    321  output_split_sizes: Optional[List[int]],
    326  … res = coll.join(self._rank, (output_buffer, input_buffer, output_split_sizes, input_split_sizes))
|
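The multi-threaded fake process group turns split sizes into start offsets with a cumulative sum (_size_cumsum at 86). A minimal sketch that mirrors that helper under the same even-split fallback; it is an assumption, not the actual implementation:

# Sketch of a cumulative-sum offset helper; assumed, not the real _size_cumsum.
from typing import List, Optional

import torch


def size_cumsum(buf_size: int, sizes: Optional[List[int]], world_size: int) -> torch.Tensor:
    if sizes is None or len(sizes) == 0:
        sizes = [buf_size // world_size] * world_size
    # Start offset of each rank's chunk, plus the total length at the end.
    return torch.cumsum(torch.tensor([0] + list(sizes)), dim=0)


print(size_cumsum(12, [3, 1, 5, 3], 4))  # tensor([ 0,  3,  4,  9, 12])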
D | distributed_test.py |
    3514  output_split_sizes = []
    3516  output_split_sizes.append(dst + 1)
    3517  sum_len = sum(output_split_sizes)
    3523  output_split_sizes[rank], sum_len, sum_len, dtype=torch.float
    3531  list(torch.split(out_tensor, output_split_sizes)),
|
/external/pytorch/torch/_C/ |
D | _distributed_c10d.pyi |
    462  output_split_sizes: list[int],
    471  output_split_sizes: list[int],
|
/external/pytorch/torch/_inductor/ |
D | lowering.py |
    6412  def _all_to_all_single(inp, output_split_sizes, input_split_sizes, group_name):  argument
    6417  output_split_sizes,
|