/external/tensorflow/tensorflow/compiler/xla/service/ |
D | all_reduce_simplifier.cc |
      41  [this](const HloInstruction* all_reduce) -> int64_t {  in Run() argument
      42  if (all_reduce->replica_groups().empty()) {  in Run()
      46  for (const auto& group : all_reduce->replica_groups()) {  in Run()
      94  auto all_reduce = all_reduce_and_group_size.first;  in Run() local
      97  TF_RETURN_IF_ERROR(all_reduce->parent()->ReplaceInstruction(  in Run()
      98  all_reduce, all_reduce->mutable_operand(0)));  in Run()
     102  if (all_reduce->to_apply()->instruction_count() != 3 ||  in Run()
     103  all_reduce->to_apply()->num_parameters() != 2) {  in Run()
     107  switch (all_reduce->to_apply()->root_instruction()->opcode()) {  in Run()
     112  all_reduce->parent()->AddInstruction(HloInstruction::CreateConstant(  in Run()
      [all …]
|
D | all_reduce_contiguous.cc |
      31  Status ReplaceWithContiguousAllReduce(HloAllReduceInstruction* all_reduce) {  in ReplaceWithContiguousAllReduce() argument
      32  TF_RET_CHECK(all_reduce);  in ReplaceWithContiguousAllReduce()
      33  TF_RET_CHECK(!all_reduce->has_sharding());  in ReplaceWithContiguousAllReduce()
      35  HloComputation& computation = *all_reduce->parent();  // never null  in ReplaceWithContiguousAllReduce()
      36  PrimitiveType element_type = all_reduce->operand(0)->shape().element_type();  in ReplaceWithContiguousAllReduce()
      40  flat_operands.reserve(all_reduce->operand_count());  in ReplaceWithContiguousAllReduce()
      42  for (HloInstruction* operand : all_reduce->operands()) {  in ReplaceWithContiguousAllReduce()
      58  concat_shape, {concatenated}, all_reduce->to_apply(),  in ReplaceWithContiguousAllReduce()
      59  all_reduce->replica_groups(),  in ReplaceWithContiguousAllReduce()
      60  /*constrain_layout=*/false, all_reduce->channel_id(),  in ReplaceWithContiguousAllReduce()
      [all …]
|
D | while_loop_all_reduce_code_motion.cc |
     159  HloInstruction* all_reduce, HloComputation* while_body,  in IsAllReduceMovable() argument
     164  VLOG(4) << "IsAllReduceMovable: " << all_reduce->ToString();  in IsAllReduceMovable()
     166  GetCollectiveOpGroupMode(all_reduce->channel_id().has_value(),  in IsAllReduceMovable()
     167  DynCast<HloAllReduceInstruction>(all_reduce)  in IsAllReduceMovable()
     170  auto all_reduce_is_summation = [](HloInstruction* all_reduce) -> bool {  in IsAllReduceMovable() argument
     172  MatchReductionComputation(all_reduce->to_apply());  in IsAllReduceMovable()
     179  all_reduce](const HloInstruction& instruction,  in IsAllReduceMovable()
     183  all_reduce->replica_groups(),  in IsAllReduceMovable()
     184  all_reduce->GetModule()->config().replica_count(),  in IsAllReduceMovable()
     185  all_reduce->GetModule()->config().num_partitions(),  in IsAllReduceMovable()
      [all …]
|
D | ar_crs_combiner.cc |
      95  auto all_reduce = Cast<HloAllReduceInstruction>(hlo);  in HasCombinableReplicaGroup() local
      96  auto replica_groups = all_reduce->replica_groups();  in HasCombinableReplicaGroup()
      98  CHECK(all_reduce->IsCrossModuleAllReduce());  in HasCombinableReplicaGroup()
     100  if (all_reduce->use_global_device_ids()) {  in HasCombinableReplicaGroup()
     544  auto all_reduce = pair.ar;  in RewriteGraph() local
     545  auto parent_computation = all_reduce->parent();  in RewriteGraph()
     546  auto channel_id = all_reduce->channel_id();  in RewriteGraph()
     547  auto prev = all_reduce->mutable_operand(0);  in RewriteGraph()
     548  auto next = all_reduce->users()[0];  in RewriteGraph()
     549  TF_CHECK_OK(all_reduce->ReplaceUseWith(next, prev));  in RewriteGraph()
      [all …]
|
/external/tensorflow/tensorflow/compiler/mlir/xla/transforms/ |
D | legalize_tf_collective.cc |
     142  auto all_reduce = builder.create<AllReduceOp>(  in ConvertAllReduce() local
     145  BuildReduceBody<AddOp>(element_type, &all_reduce.computation(), &builder);  in ConvertAllReduce()
     147  BuildReduceBody<MulOp>(element_type, &all_reduce.computation(), &builder);  in ConvertAllReduce()
     149  BuildReduceBody<MinOp>(element_type, &all_reduce.computation(), &builder);  in ConvertAllReduce()
     151  BuildReduceBody<MaxOp>(element_type, &all_reduce.computation(), &builder);  in ConvertAllReduce()
     157  Operation* result = all_reduce;  in ConvertAllReduce()
     169  loc, all_reduce.getResult(), divisor.getResult(), broadcast_dims);  in ConvertAllReduce()
     199  LogicalResult matchAndRewrite(TF::XlaAllReduceOp all_reduce,  in matchAndRewrite() argument
     202  if (failed(ConvertReplicaGroups(rewriter, all_reduce.group_assignment(),  in matchAndRewrite()
     203  replica_groups, all_reduce))) {  in matchAndRewrite()
      [all …]
|
/external/tensorflow/tensorflow/compiler/xla/service/gpu/ |
D | all_reduce_blueconnect.cc |
     111  const HloAllReduceInstruction& all_reduce, size_t num_devices_per_host) {  in TryDecomposeReplicaGroups() argument
     113  all_reduce.parent()->parent()->config().static_device_assignment();  in TryDecomposeReplicaGroups()
     115  absl::Span<const ReplicaGroup> replica_groups = all_reduce.replica_groups();  in TryDecomposeReplicaGroups()
     142  for (const HloInstruction* operand : all_reduce.operands()) {  in TryDecomposeReplicaGroups()
     186  StatusOr<bool> TryDecomposeAllReduce(HloAllReduceInstruction* all_reduce,  in TryDecomposeAllReduce() argument
     188  TF_RET_CHECK(all_reduce);  in TryDecomposeAllReduce()
     189  TF_RET_CHECK(!all_reduce->has_sharding());  in TryDecomposeAllReduce()
     191  HloComputation& computation = *all_reduce->parent();  // never null  in TryDecomposeAllReduce()
     192  PrimitiveType element_type = all_reduce->operand(0)->shape().element_type();  in TryDecomposeAllReduce()
     196  TryDecomposeReplicaGroups(*all_reduce, num_devices_per_host));  in TryDecomposeAllReduce()
      [all …]
|
D | nvptx_compiler_test.cc |
      53  HloInstruction* all_reduce = module->entry_computation()->root_instruction();  in TEST_F() local
      54  EXPECT_TRUE(buffer_assignment->SharesTopLevelSlice(all_reduce,  in TEST_F()
      55  all_reduce->operand(0)));  in TEST_F()
      83  HloInstruction* all_reduce = module->entry_computation()->root_instruction();  in TEST_F() local
      85  all_reduce, {0}, all_reduce->operand(0), {}));  in TEST_F()
      87  all_reduce, {1}, all_reduce->operand(1), {}));  in TEST_F()
|
D | all_reduce_blueconnect_test.cc |
      75  auto all_reduce = AllOf(op::Shape("f32[4]"), op::AllReduce(reduce_scatter),  in TEST_F() local
      77  auto all_gather = AllOf(op::Shape("f32[16]"), op::AllGather(all_reduce),  in TEST_F()
     117  auto all_reduce = AllOf(op::Shape("f32[2]"), op::AllReduce(reduce_scatter1),  in TEST_F() local
     119  auto all_gather0 = AllOf(op::Shape("f32[4]"), op::AllGather(all_reduce),  in TEST_F()
     162  auto all_reduce = AllOf(op::Shape("(f32[4], f32[8])"),  in TEST_F() local
     167  op::AllGather(op::GetTupleElement(all_reduce, 0),  in TEST_F()
     168  op::GetTupleElement(all_reduce, 1)),  in TEST_F()
|
/external/tensorflow/tensorflow/compiler/xla/tests/ |
D | all_reduce_test.cc |
     124  …(f32[] %p0.1), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     125  … %convert.11), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     126  ….12), index=0, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     127  ….12), index=1, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     128  …omputation.15, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     129  ….19), index=1, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     130  …e-element.21), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     131  ….19), index=0, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     166  …(f32[] %p0.1), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
     167  … %convert.11), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so…  in XLA_TEST_F()
      [all …]
|
/external/tensorflow/tensorflow/dtensor/mlir/ |
D | dtensor_allreduce_scatter_optimization.cc |
      79  function.walk([&](mlir::TF::DTensorAllReduceOp all_reduce) {  in ApplyOptimization() argument
      80  if (all_reduce->hasOneUse()) {  in ApplyOptimization()
      82  *all_reduce->getUsers().begin())) {  in ApplyOptimization()
      84  if (VLOG_IS_ON(2)) all_reduce.dump();  in ApplyOptimization()
     112  if (!matchPattern(all_reduce.group_assignment(),  in ApplyOptimization()
     114  all_reduce.emitOpError("group_assignment should be a constant");  in ApplyOptimization()
     131  mlir::OpBuilder builder(all_reduce);  in ApplyOptimization()
     133  all_reduce.getLoc(),  in ApplyOptimization()
     139  all_reduce.getLoc(), all_scatter->getResultTypes(),  in ApplyOptimization()
     140  all_reduce.getOperand(0), all_reduce.group_assignment(),  in ApplyOptimization()
      [all …]
|
D | dtensor_allreduce_combine_optimization.cc |
     158  for (mlir::TF::DTensorAllReduceOp& all_reduce : all_reduce_group) {  in MergeAllReduceGroup()
     160  all_reduce.getType().dyn_cast<mlir::RankedTensorType>();  in MergeAllReduceGroup()
     162  return all_reduce.emitOpError(llvm::formatv(  in MergeAllReduceGroup()
     197  mlir::TF::DTensorAllReduceOp& all_reduce = all_reduce_group[i];  in MergeAllReduceGroup() local
     198  mlir::Location loc = all_reduce.getLoc();  in MergeAllReduceGroup()
     200  all_reduce.getType().dyn_cast<mlir::RankedTensorType>();  in MergeAllReduceGroup()
     202  return all_reduce.emitOpError(llvm::formatv(  in MergeAllReduceGroup()
     209  loc, all_reduce.input(),  in MergeAllReduceGroup()
     234  mlir::TF::DTensorAllReduceOp& all_reduce = all_reduce_group[i];  in MergeAllReduceGroup() local
     235  mlir::Location loc = all_reduce.getLoc();  in MergeAllReduceGroup()
      [all …]
|
D | dtensor_allreduce_sum_optimization.cc |
     154  mlir::TF::DTensorAllReduceOp all_reduce =  in OptimizeAllReduceAndSum() local
     172  SetSingleLayoutOnOp(all_reduce, layout_or_status->value());  in OptimizeAllReduceAndSum()
     177  all_reduce.output(),  in OptimizeAllReduceAndSum()
     178  llvm::SmallPtrSet<mlir::Operation*, 1>{all_reduce.getOperation()});  in OptimizeAllReduceAndSum()
     290  mlir::TF::DTensorAllReduceOp all_reduce =  in CheckWhileLoopOptimizationCriteria() local
     293  if (all_reduce) {  in CheckWhileLoopOptimizationCriteria()
     297  all_reduce = llvm::dyn_cast_or_null<mlir::TF::DTensorAllReduceOp>(  in CheckWhileLoopOptimizationCriteria()
     302  if (!block_arg || !all_reduce) return false;  in CheckWhileLoopOptimizationCriteria()
     307  all_reduce.group_assignment().getDefiningOp();  in CheckWhileLoopOptimizationCriteria()
     311  if (all_reduce.reduce_op().str() != kReduceOpAdd) return false;  in CheckWhileLoopOptimizationCriteria()
      [all …]
|
/external/pytorch/test/distributed/_tensor/debug/ |
D | test_comm_mode_features.py |
     205  c10d_functional.all_reduce
     211  c10d_functional.all_reduce
     217  c10d_functional.all_reduce
     272  c10d_functional.all_reduce
     284  c10d_functional.all_reduce
     296  c10d_functional.all_reduce
     302  c10d_functional.all_reduce
     308  c10d_functional.all_reduce
     314  c10d_functional.all_reduce
     321  ][c10d_functional.all_reduce],
      [all …]
|
/external/tensorflow/tensorflow/dtensor/mlir/utils/ |
D | collective_lowering.cc |
      91  mlir::TF::DTensorAllReduceOp all_reduce,  in EmitAllReduceForXla() argument
      96  all_reduce.getLoc(), all_reduce.getResult().getType(), all_reduce.input(),  in EmitAllReduceForXla()
      97  all_reduce.group_assignment(), all_reduce.reduce_opAttr(),  in EmitAllReduceForXla()
     205  mlir::TF::DTensorAllReduceOp all_reduce, mlir::Value* value) {  in LowerAllReduceOpImpl() argument
     206  mlir::Location loc = all_reduce.getLoc();  in LowerAllReduceOpImpl()
     208  ExtractRequiredSingleLayoutFromOp(all_reduce);  in LowerAllReduceOpImpl()
     210  return all_reduce.emitOpError(output_layout.status().error_message());  in LowerAllReduceOpImpl()
     213  if (!matchPattern(all_reduce.group_assignment(),  in LowerAllReduceOpImpl()
     221  const bool is_tpu = all_reduce.device_type().endswith("TPU");  in LowerAllReduceOpImpl()
     227  if (mlir::failed(EmitAllReduceForXla(context, builder, all_reduce,  in LowerAllReduceOpImpl()
      [all …]
|
/external/tensorflow/tensorflow/python/ops/ |
D | collective_ops_gpu_test.py |
      69  collectives.append(collective_ops.all_reduce(
      88  collective_ops.all_reduce(
     111  collectives.append(collective_ops.all_reduce(
     133  collectives.append(collective_ops.all_reduce(
     235  collective_ops.all_reduce(in0, self._group_size, group_key,
     239  collective_ops.all_reduce(in1, self._group_size, group_key,
     255  c0 = collective_ops.all_reduce(
     260  c1 = collective_ops.all_reduce(
     279  collective_ops.all_reduce(
     296  collective_ops.all_reduce(
      [all …]
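The GPU test above drives the low-level collective_ops.all_reduce wrapper, which takes a group size plus group/instance keys identifying the participating devices. A minimal sketch of that calling pattern follows; the device list, key values, and inputs are illustrative assumptions, not taken from the test.

    # Hedged sketch of the collective_ops.all_reduce calling pattern; devices,
    # keys, and inputs are assumptions for illustration only.
    import tensorflow as tf
    from tensorflow.python.ops import collective_ops

    group_size, group_key, instance_key = 2, 1, 1
    devices = ['/GPU:0', '/GPU:1']

    @tf.function
    def run_all_reduce():
      results = []
      for i, device in enumerate(devices):
        with tf.device(device):
          t = tf.constant([1.0, 2.0, 3.0]) * (i + 1)
          # Every participant contributes a tensor; each gets back the
          # element-wise reduction (Add merge, Id final) across the group.
          results.append(collective_ops.all_reduce(
              t, group_size, group_key, instance_key,
              merge_op='Add', final_op='Id'))
      return results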
|
/external/pytorch/test/distributed/tensor/parallel/ |
D | test_tp_examples.py |
      47  reduce_scatter, all_gather, all_reduce = (  variable
      50  c10d_functional.all_reduce,
     135  self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 1)
     140  dist.all_reduce(model.net1.weight.grad)
     141  dist.all_reduce(model.net1.bias.grad)
     142  dist.all_reduce(model.net2.weight.grad)
     143  dist.all_reduce(model.net2.bias.grad)
     286  fwd={all_reduce: 6, all_gather: 1}, bwd={all_reduce: 9}
     291  optim={all_reduce: 30},
     345  bwd={reduce_scatter: 5, all_gather: 6}, optim={all_reduce: 30}
      [all …]
|
/external/tensorflow/tensorflow/python/kernel_tests/ |
D | collective_ops_multi_worker_test.py |
     109  collective_ops.all_reduce(
     182  collective_ops.all_reduce(
     200  collective_ops.all_reduce(
     210  collective_ops.all_reduce(
     222  collective_ops.all_reduce(
     253  collective_ops.all_reduce(in_tensor, group_size, group_key,
     259  collective_ops.all_reduce(in_tensor, group_size, group_key,
     267  collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key)
     282  collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key)
|
D | collective_ops_test.py |
      47  def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs):  member in CollectiveOpsV1
      49  return _collective_ops.all_reduce(t, group_size, group_key, instance_key,
      65  def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs):  member in CollectiveOpsV2
     107  combinations.NamedObject('all_reduce', CollectiveOpsV1.all_reduce),
     109  combinations.NamedObject('all_reduce_v2', CollectiveOpsV2.all_reduce),
     144  return collective_ops.all_reduce(
     161  collective_ops.all_reduce(
     170  collective_ops.all_reduce(
     329  collective_ops.all_reduce(
     338  collective_ops.all_reduce(
      [all …]
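This test wraps both op generations behind the same signature so one parameterized body can exercise either, as the combinations.NamedObject entries above suggest. A rough reconstruction of that wrapper pattern is sketched below; it is simplified from the visible snippets, not copied from the file.

    # Simplified reconstruction of the V1/V2 dispatch pattern; the exact
    # bodies in collective_ops_test.py may differ.
    from tensorflow.python.ops import collective_ops as _collective_ops

    class CollectiveOpsV1:
      @staticmethod
      def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs):
        return _collective_ops.all_reduce(t, group_size, group_key,
                                          instance_key, *args, **kwargs)

    class CollectiveOpsV2:
      @staticmethod
      def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs):
        return _collective_ops.all_reduce_v2(t, group_size, group_key,
                                             instance_key, *args, **kwargs)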
|
/external/tensorflow/tensorflow/compiler/mlir/xla/tests/ |
D | legalize-tf-collective.mlir |
      10  // CHECK: "mhlo.all_reduce"
      26  // CHECK: "mhlo.all_reduce"
      32  // CHECK: "mhlo.all_reduce"
      47  // CHECK: "mhlo.all_reduce"
      56  // CHECK: "mhlo.all_reduce"
      66  // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce"
      77  // CHECK: "mhlo.all_reduce"
      86  // CHECK: "mhlo.all_reduce"
     103  // CHECK: "mhlo.all_reduce"
     109  // CHECK: "mhlo.all_reduce"
      [all …]
|
/external/pytorch/test/distributed/ |
D | test_multi_threaded_pg.py |
     199  dist.all_reduce(output)
     220  dist.all_reduce(tensor, op=ReduceOp.PRODUCT)
     225  dist.all_reduce(tensor, op=ReduceOp.MIN)
     229  dist.all_reduce(tensor, op=ReduceOp.MAX)
     233  dist.all_reduce(tensor, op=ReduceOp.BAND)
     238  dist.all_reduce(tensor, op=ReduceOp.BOR)
     243  dist.all_reduce(tensor, op=ReduceOp.BXOR)
     264  dist.all_reduce(output, group=subpg0)
     266  dist.all_reduce(output, group=subpg1)
     276  dist.all_reduce(x, group=pg)
      [all …]
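In the PyTorch entries above, dist.all_reduce reduces a tensor in place across every rank of a process group, with the reduction selected via ReduceOp (SUM by default). Below is a minimal runnable sketch using a single-process gloo group; the backend, address, and port are assumptions made only so the snippet can run standalone.

    # Single-process sketch of the dist.all_reduce usage seen in the test;
    # backend, address, and port are arbitrary assumptions.
    import torch
    import torch.distributed as dist

    dist.init_process_group(backend="gloo",
                            init_method="tcp://127.0.0.1:29500",
                            rank=0, world_size=1)

    tensor = torch.tensor([1.0, 2.0, 3.0])
    dist.all_reduce(tensor)                        # in-place SUM over the group
    dist.all_reduce(tensor, op=dist.ReduceOp.MAX)  # MIN/MAX/PRODUCT etc. also work
    print(tensor)

    dist.destroy_process_group()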
|
D | test_compute_comm_reordering.py |
     110  ar = _functional_collectives.all_reduce(a, "sum", "0")
     152  e = _functional_collectives.all_reduce(b, "sum", "0")
     199  e = _functional_collectives.all_reduce(b, "sum", "0")
     248  ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
     253  fr = _functional_collectives.all_reduce(f, "sum", ranks, tag)
     305  ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
     310  fr = _functional_collectives.all_reduce(f, "sum", ranks, tag)
     355  ar = _functional_collectives.all_reduce(div, "sum", ranks, tag)
|
D | test_inductor_collectives.py |
     104  ar = torch.ops.c10d_functional.all_reduce(z, "sum", tag, ranks, group_size)
     138  dist.all_reduce(y, op=dist.ReduceOp.SUM)
     144  y = dist.all_reduce(y, op=dist.ReduceOp.SUM)
     167  op = torch.ops._c10d_functional.all_reduce.default
     169  op = torch.ops.c10d_functional.all_reduce.default
     179  ar = torch.ops.c10d_functional.all_reduce(z, "sum", tag, ranks, group_size)
     218  ar = torch.ops.c10d_functional.all_reduce(z, "sum", tag, ranks, group_size)
     251  ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
     525  ar = torch.ops.c10d_functional.all_reduce(
     560  ar = torch.ops.c10d_functional.all_reduce(x, "sum", tag, ranks, group_size)
      [all …]
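These Inductor tests mix the in-place dist.all_reduce with functional collectives, which return a fresh tensor and therefore trace cleanly under torch.compile. A small sketch of the functional form follows; it assumes a default process group has already been initialized elsewhere, and the function name is made up for illustration.

    # Sketch of the value-returning functional all_reduce used by the
    # compiled tests; assumes dist.init_process_group(...) was called earlier.
    import torch
    import torch.distributed as dist
    import torch.distributed._functional_collectives as funcol

    def compiled_step(x: torch.Tensor) -> torch.Tensor:
        # Functional form: returns a new tensor instead of mutating x, so the
        # collective can be captured and reordered by the compiler.
        reduced = funcol.all_reduce(x, "sum", group=dist.group.WORLD)
        return reduced * 2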
|
/external/pytorch/torch/distributed/algorithms/ddp_comm_hooks/ |
D | default_hooks.py |
      27  dist.all_reduce(tensor, group=group_to_use, async_op=True)
      91  grad = dist._functional_collectives.all_reduce(
      96  fut = dist.all_reduce(
     140  grad = dist._functional_collectives.all_reduce(
     145  fut = dist.all_reduce(
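default_hooks.py builds DDP communication hooks on top of all_reduce: the bucketed gradients are averaged with an asynchronous all_reduce and the resulting future is handed back to DDP. The sketch below is a simplified reconstruction of that hook shape, not the exact library code.

    # Simplified reconstruction of a DDP allreduce comm hook; the real
    # default_hooks.py handles more options, so treat this as a sketch.
    import torch
    import torch.distributed as dist

    def allreduce_hook(process_group, bucket) -> torch.futures.Future:
        group = process_group if process_group is not None else dist.group.WORLD
        # Pre-divide by world size so the summed result is an average.
        tensor = bucket.buffer().div_(group.size())
        fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()
        # DDP expects a future that resolves to the reduced gradient tensor.
        return fut.then(lambda f: f.value()[0])

Such a hook would be attached with DistributedDataParallel.register_comm_hook(state=None, hook=allreduce_hook).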
|
/external/tensorflow/third_party/nccl/ |
D | archive.patch |
       5  diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
       7  rename from src/collectives/device/all_reduce.cu
       8  rename to src/collectives/device/all_reduce.cu.cc
|
/external/pytorch/torch/distributed/algorithms/ |
D | join.py |
     284  dist.all_reduce(num_nonjoined_procs, group=self._process_group)
     293  dist.all_reduce(ones, group=self._process_group)
     337  work = dist.all_reduce(ones, group=process_group, async_op=True)
     342  dist.all_reduce(zeros, group=process_group)
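The Join snippets above count still-active ranks by all-reducing ones (active) or zeros (joined). A tiny sketch of that counting idiom is given below; the helper name and tensor dtype are assumptions for illustration.

    # Hypothetical helper illustrating the counting idiom from join.py: each
    # rank contributes 1 while active and 0 once joined; the SUM all_reduce
    # tells every rank how many peers have not yet joined.
    import torch
    import torch.distributed as dist

    def count_nonjoined_ranks(process_group, is_joined: bool) -> int:
        contribution = torch.zeros(1) if is_joined else torch.ones(1)
        dist.all_reduce(contribution, group=process_group)  # defaults to SUM
        return int(contribution.item())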
|