
Searched refs:all_reduce (Results 1 – 25 of 150) sorted by relevance


/external/tensorflow/tensorflow/compiler/xla/service/
all_reduce_simplifier.cc
41 [this](const HloInstruction* all_reduce) -> int64_t { in Run() argument
42 if (all_reduce->replica_groups().empty()) { in Run()
46 for (const auto& group : all_reduce->replica_groups()) { in Run()
94 auto all_reduce = all_reduce_and_group_size.first; in Run() local
97 TF_RETURN_IF_ERROR(all_reduce->parent()->ReplaceInstruction( in Run()
98 all_reduce, all_reduce->mutable_operand(0))); in Run()
102 if (all_reduce->to_apply()->instruction_count() != 3 || in Run()
103 all_reduce->to_apply()->num_parameters() != 2) { in Run()
107 switch (all_reduce->to_apply()->root_instruction()->opcode()) { in Run()
112 all_reduce->parent()->AddInstruction(HloInstruction::CreateConstant( in Run()
[all …]
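The simplifier above removes all-reduces that need no communication: judging from the snippet, an all-reduce whose replica groups each contain a single participant is replaced by its operand, and a replicated input reduced with a trivial two-parameter computation can be folded arithmetically. A minimal sketch of that folding, with numpy standing in for HLO:

    import numpy as np

    # Toy model: if every participant holds the same value x, the
    # all-reduce needs no communication at all.
    def simplified_all_reduce(x, group_size, op):
        if op == "sum":            # k identical contributions sum to x * k
            return x * group_size
        if op in ("min", "max"):   # min/max of identical copies is x
            return x
        raise NotImplementedError(op)

    x = np.arange(4.0)
    assert np.allclose(simplified_all_reduce(x, 8, "sum"), x * 8)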
all_reduce_contiguous.cc
31 Status ReplaceWithContiguousAllReduce(HloAllReduceInstruction* all_reduce) { in ReplaceWithContiguousAllReduce() argument
32 TF_RET_CHECK(all_reduce); in ReplaceWithContiguousAllReduce()
33 TF_RET_CHECK(!all_reduce->has_sharding()); in ReplaceWithContiguousAllReduce()
35 HloComputation& computation = *all_reduce->parent(); // never null in ReplaceWithContiguousAllReduce()
36 PrimitiveType element_type = all_reduce->operand(0)->shape().element_type(); in ReplaceWithContiguousAllReduce()
40 flat_operands.reserve(all_reduce->operand_count()); in ReplaceWithContiguousAllReduce()
42 for (HloInstruction* operand : all_reduce->operands()) { in ReplaceWithContiguousAllReduce()
58 concat_shape, {concatenated}, all_reduce->to_apply(), in ReplaceWithContiguousAllReduce()
59 all_reduce->replica_groups(), in ReplaceWithContiguousAllReduce()
60 /*constrain_layout=*/false, all_reduce->channel_id(), in ReplaceWithContiguousAllReduce()
[all …]
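ReplaceWithContiguousAllReduce turns a multi-operand all-reduce into a single collective over one flat buffer: each operand is flattened, the pieces are concatenated, one all-reduce runs over the concatenation, and the result is sliced and reshaped back. A sketch of the same transformation in Python, where fake_all_reduce_sum is a hypothetical stand-in for the collective:

    import numpy as np

    def contiguous_all_reduce(operands):      # this rank's operand list
        shapes = [a.shape for a in operands]
        flat = np.concatenate([a.ravel() for a in operands])
        reduced = fake_all_reduce_sum(flat)   # hypothetical collective call
        out, offset = [], 0
        for shape in shapes:
            n = int(np.prod(shape))
            out.append(reduced[offset:offset + n].reshape(shape))
            offset += n
        return out

The payoff is one collective launch (and one latency cost) instead of one per operand.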
while_loop_all_reduce_code_motion.cc
159 HloInstruction* all_reduce, HloComputation* while_body, in IsAllReduceMovable() argument
164 VLOG(4) << "IsAllReduceMovable: " << all_reduce->ToString(); in IsAllReduceMovable()
166 GetCollectiveOpGroupMode(all_reduce->channel_id().has_value(), in IsAllReduceMovable()
167 DynCast<HloAllReduceInstruction>(all_reduce) in IsAllReduceMovable()
170 auto all_reduce_is_summation = [](HloInstruction* all_reduce) -> bool { in IsAllReduceMovable() argument
172 MatchReductionComputation(all_reduce->to_apply()); in IsAllReduceMovable()
179 all_reduce](const HloInstruction& instruction, in IsAllReduceMovable()
183 all_reduce->replica_groups(), in IsAllReduceMovable()
184 all_reduce->GetModule()->config().replica_count(), in IsAllReduceMovable()
185 all_reduce->GetModule()->config().num_partitions(), in IsAllReduceMovable()
[all …]
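IsAllReduceMovable gates the code motion: the all-reduce must compute a summation (MatchReductionComputation in the snippet) that the while loop merely accumulates, because a sum all-reduce distributes over loop-carried addition. A sketch of the rewrite the pass is aiming for, with local_gradient and all_reduce_sum as hypothetical stand-ins:

    # Before: one collective per iteration.
    #   for step in range(n): acc += all_reduce_sum(local_gradient(step))
    # After: accumulate locally, reduce once.
    acc = 0.0
    for step in range(n):
        acc += local_gradient(step)   # hypothetical per-step value
    total = all_reduce_sum(acc)       # hypothetical collective, hoisted out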
ar_crs_combiner.cc
95 auto all_reduce = Cast<HloAllReduceInstruction>(hlo); in HasCombinableReplicaGroup() local
96 auto replica_groups = all_reduce->replica_groups(); in HasCombinableReplicaGroup()
98 CHECK(all_reduce->IsCrossModuleAllReduce()); in HasCombinableReplicaGroup()
100 if (all_reduce->use_global_device_ids()) { in HasCombinableReplicaGroup()
544 auto all_reduce = pair.ar; in RewriteGraph() local
545 auto parent_computation = all_reduce->parent(); in RewriteGraph()
546 auto channel_id = all_reduce->channel_id(); in RewriteGraph()
547 auto prev = all_reduce->mutable_operand(0); in RewriteGraph()
548 auto next = all_reduce->users()[0]; in RewriteGraph()
549 TF_CHECK_OK(all_reduce->ReplaceUseWith(next, prev)); in RewriteGraph()
[all …]
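The combiner pairs a cross-module all-reduce with a later cross-replica sum and, as the RewriteGraph snippet shows (ReplaceUseWith(next, prev)), deletes the former: summing first within partition groups and then across replicas is the same associative sum as reducing once over the combined group. A quick numpy check of that identity:

    import numpy as np

    vals = np.random.rand(4, 2)          # 4 replicas x 2 partitions
    two_stage = vals.sum(axis=1).sum()   # reduce partitions, then replicas
    one_stage = vals.sum()               # one combined reduction
    assert np.isclose(two_stage, one_stage)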
/external/tensorflow/tensorflow/compiler/mlir/xla/transforms/
legalize_tf_collective.cc
142 auto all_reduce = builder.create<AllReduceOp>( in ConvertAllReduce() local
145 BuildReduceBody<AddOp>(element_type, &all_reduce.computation(), &builder); in ConvertAllReduce()
147 BuildReduceBody<MulOp>(element_type, &all_reduce.computation(), &builder); in ConvertAllReduce()
149 BuildReduceBody<MinOp>(element_type, &all_reduce.computation(), &builder); in ConvertAllReduce()
151 BuildReduceBody<MaxOp>(element_type, &all_reduce.computation(), &builder); in ConvertAllReduce()
157 Operation* result = all_reduce; in ConvertAllReduce()
169 loc, all_reduce.getResult(), divisor.getResult(), broadcast_dims); in ConvertAllReduce()
199 LogicalResult matchAndRewrite(TF::XlaAllReduceOp all_reduce, in matchAndRewrite() argument
202 if (failed(ConvertReplicaGroups(rewriter, all_reduce.group_assignment(), in matchAndRewrite()
203 replica_groups, all_reduce))) { in matchAndRewrite()
[all …]
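The lowering picks the mhlo.all_reduce body from the TF reduction kind (Add, Mul, Min, Max above); Mean has no primitive reduce body, so it is emitted as a sum all-reduce followed by a divide by the group size (the divisor at line 169 of the snippet). Conceptually:

    # Mean lowering, with all_reduce_sum as a hypothetical collective:
    def all_reduce_mean(x, group_size):
        return all_reduce_sum(x) / group_size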
/external/tensorflow/tensorflow/compiler/xla/service/gpu/
all_reduce_blueconnect.cc
111 const HloAllReduceInstruction& all_reduce, size_t num_devices_per_host) { in TryDecomposeReplicaGroups() argument
113 all_reduce.parent()->parent()->config().static_device_assignment(); in TryDecomposeReplicaGroups()
115 absl::Span<const ReplicaGroup> replica_groups = all_reduce.replica_groups(); in TryDecomposeReplicaGroups()
142 for (const HloInstruction* operand : all_reduce.operands()) { in TryDecomposeReplicaGroups()
186 StatusOr<bool> TryDecomposeAllReduce(HloAllReduceInstruction* all_reduce, in TryDecomposeAllReduce() argument
188 TF_RET_CHECK(all_reduce); in TryDecomposeAllReduce()
189 TF_RET_CHECK(!all_reduce->has_sharding()); in TryDecomposeAllReduce()
191 HloComputation& computation = *all_reduce->parent(); // never null in TryDecomposeAllReduce()
192 PrimitiveType element_type = all_reduce->operand(0)->shape().element_type(); in TryDecomposeAllReduce()
196 TryDecomposeReplicaGroups(*all_reduce, num_devices_per_host)); in TryDecomposeAllReduce()
[all …]
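This pass implements the BlueConnect decomposition: one flat all-reduce becomes a reduce-scatter among the devices of each host, a cross-host all-reduce over each shard, and an all-gather within each host, so the slower inter-host links carry only 1/num_devices_per_host of the data. A sketch with hypothetical helper collectives (shard shapes as in all_reduce_blueconnect_test.cc below: f32[16] over four devices per host, f32[4] shards):

    def blueconnect_all_reduce(x):             # x: this device's f32[16]
        shard = reduce_scatter_within_host(x)  # -> f32[4], hypothetical
        shard = all_reduce_across_hosts(shard) # -> f32[4], hypothetical
        return all_gather_within_host(shard)   # -> f32[16], hypothetical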
nvptx_compiler_test.cc
53 HloInstruction* all_reduce = module->entry_computation()->root_instruction(); in TEST_F() local
54 EXPECT_TRUE(buffer_assignment->SharesTopLevelSlice(all_reduce, in TEST_F()
55 all_reduce->operand(0))); in TEST_F()
83 HloInstruction* all_reduce = module->entry_computation()->root_instruction(); in TEST_F() local
85 all_reduce, {0}, all_reduce->operand(0), {})); in TEST_F()
87 all_reduce, {1}, all_reduce->operand(1), {})); in TEST_F()
all_reduce_blueconnect_test.cc
75 auto all_reduce = AllOf(op::Shape("f32[4]"), op::AllReduce(reduce_scatter), in TEST_F() local
77 auto all_gather = AllOf(op::Shape("f32[16]"), op::AllGather(all_reduce), in TEST_F()
117 auto all_reduce = AllOf(op::Shape("f32[2]"), op::AllReduce(reduce_scatter1), in TEST_F() local
119 auto all_gather0 = AllOf(op::Shape("f32[4]"), op::AllGather(all_reduce), in TEST_F()
162 auto all_reduce = AllOf(op::Shape("(f32[4], f32[8])"), in TEST_F() local
167 op::AllGather(op::GetTupleElement(all_reduce, 0), in TEST_F()
168 op::GetTupleElement(all_reduce, 1)), in TEST_F()
/external/tensorflow/tensorflow/compiler/xla/tests/
all_reduce_test.cc
124 …(f32[] %p0.1), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
125 … %convert.11), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
126 ….12), index=0, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
127 ….12), index=1, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
128 …omputation.15, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
129 ….19), index=1, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
130 …e-element.21), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
131 ….19), index=0, metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
166 …(f32[] %p0.1), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
167 … %convert.11), metadata={op_type="xla::cross_replica_sum" source_file="all_reduce@xla_model.py" so… in XLA_TEST_F()
[all …]
/external/tensorflow/tensorflow/dtensor/mlir/
dtensor_allreduce_scatter_optimization.cc
79 function.walk([&](mlir::TF::DTensorAllReduceOp all_reduce) { in ApplyOptimization() argument
80 if (all_reduce->hasOneUse()) { in ApplyOptimization()
82 *all_reduce->getUsers().begin())) { in ApplyOptimization()
84 if (VLOG_IS_ON(2)) all_reduce.dump(); in ApplyOptimization()
112 if (!matchPattern(all_reduce.group_assignment(), in ApplyOptimization()
114 all_reduce.emitOpError("group_assignment should be a constant"); in ApplyOptimization()
131 mlir::OpBuilder builder(all_reduce); in ApplyOptimization()
133 all_reduce.getLoc(), in ApplyOptimization()
139 all_reduce.getLoc(), all_scatter->getResultTypes(), in ApplyOptimization()
140 all_reduce.getOperand(0), all_reduce.group_assignment(), in ApplyOptimization()
[all …]
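When a DTensorAllReduce has exactly one user and that user scatters the result (the hasOneUse check above), the pair is fused into a reduce-scatter, so each device only ever materializes its own shard. The equivalence, checked with numpy:

    import numpy as np

    vals = np.random.rand(4, 8)    # 4 devices, 8 elements each
    rank, shard = 1, slice(2, 4)   # rank 1 owns elements 2..3
    ar_then_scatter = vals.sum(axis=0)[shard]    # all-reduce, then slice
    reduce_scatter = vals[:, shard].sum(axis=0)  # reduce only the shard
    assert np.allclose(ar_then_scatter, reduce_scatter)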
dtensor_allreduce_combine_optimization.cc
158 for (mlir::TF::DTensorAllReduceOp& all_reduce : all_reduce_group) { in MergeAllReduceGroup()
160 all_reduce.getType().dyn_cast<mlir::RankedTensorType>(); in MergeAllReduceGroup()
162 return all_reduce.emitOpError(llvm::formatv( in MergeAllReduceGroup()
197 mlir::TF::DTensorAllReduceOp& all_reduce = all_reduce_group[i]; in MergeAllReduceGroup() local
198 mlir::Location loc = all_reduce.getLoc(); in MergeAllReduceGroup()
200 all_reduce.getType().dyn_cast<mlir::RankedTensorType>(); in MergeAllReduceGroup()
202 return all_reduce.emitOpError(llvm::formatv( in MergeAllReduceGroup()
209 loc, all_reduce.input(), in MergeAllReduceGroup()
234 mlir::TF::DTensorAllReduceOp& all_reduce = all_reduce_group[i]; in MergeAllReduceGroup() local
235 mlir::Location loc = all_reduce.getLoc(); in MergeAllReduceGroup()
[all …]
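MergeAllReduceGroup applies the same flattening trick as XLA's contiguous pass above, but across independent DTensorAllReduce ops: a group of compatible all-reduces is merged into one collective over a concatenated buffer (the idiom DDP uses for gradient bucketing). A PyTorch sketch, assuming an initialized process group:

    import torch
    import torch.distributed as dist

    def fused_all_reduce(tensors):
        flat = torch.cat([t.flatten() for t in tensors])
        dist.all_reduce(flat)                 # one collective instead of N
        outs, offset = [], 0
        for t in tensors:
            outs.append(flat[offset:offset + t.numel()].view_as(t))
            offset += t.numel()
        return outs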
dtensor_allreduce_sum_optimization.cc
154 mlir::TF::DTensorAllReduceOp all_reduce = in OptimizeAllReduceAndSum() local
172 SetSingleLayoutOnOp(all_reduce, layout_or_status->value()); in OptimizeAllReduceAndSum()
177 all_reduce.output(), in OptimizeAllReduceAndSum()
178 llvm::SmallPtrSet<mlir::Operation*, 1>{all_reduce.getOperation()}); in OptimizeAllReduceAndSum()
290 mlir::TF::DTensorAllReduceOp all_reduce = in CheckWhileLoopOptimizationCriteria() local
293 if (all_reduce) { in CheckWhileLoopOptimizationCriteria()
297 all_reduce = llvm::dyn_cast_or_null<mlir::TF::DTensorAllReduceOp>( in CheckWhileLoopOptimizationCriteria()
302 if (!block_arg || !all_reduce) return false; in CheckWhileLoopOptimizationCriteria()
307 all_reduce.group_assignment().getDefiningOp(); in CheckWhileLoopOptimizationCriteria()
311 if (all_reduce.reduce_op().str() != kReduceOpAdd) return false; in CheckWhileLoopOptimizationCriteria()
[all …]
/external/pytorch/test/distributed/_tensor/debug/
test_comm_mode_features.py
205 c10d_functional.all_reduce
211 c10d_functional.all_reduce
217 c10d_functional.all_reduce
272 c10d_functional.all_reduce
284 c10d_functional.all_reduce
296 c10d_functional.all_reduce
302 c10d_functional.all_reduce
308 c10d_functional.all_reduce
314 c10d_functional.all_reduce
321 ][c10d_functional.all_reduce],
[all …]
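These assertions read collective counts out of CommDebugMode. A minimal sketch of the pattern, assuming an initialized mesh and a model whose forward issues exactly one all-reduce (the CommDebugMode import path is the one these tests use and may differ across PyTorch versions):

    import torch
    from torch.distributed._tensor.debug import CommDebugMode

    c10d_functional = torch.ops.c10d_functional
    comm_mode = CommDebugMode()
    with comm_mode:
        out = model(inp)   # hypothetical model and input
    assert comm_mode.get_comm_counts()[c10d_functional.all_reduce] == 1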
/external/tensorflow/tensorflow/dtensor/mlir/utils/
collective_lowering.cc
91 mlir::TF::DTensorAllReduceOp all_reduce, in EmitAllReduceForXla() argument
96 all_reduce.getLoc(), all_reduce.getResult().getType(), all_reduce.input(), in EmitAllReduceForXla()
97 all_reduce.group_assignment(), all_reduce.reduce_opAttr(), in EmitAllReduceForXla()
205 mlir::TF::DTensorAllReduceOp all_reduce, mlir::Value* value) { in LowerAllReduceOpImpl() argument
206 mlir::Location loc = all_reduce.getLoc(); in LowerAllReduceOpImpl()
208 ExtractRequiredSingleLayoutFromOp(all_reduce); in LowerAllReduceOpImpl()
210 return all_reduce.emitOpError(output_layout.status().error_message()); in LowerAllReduceOpImpl()
213 if (!matchPattern(all_reduce.group_assignment(), in LowerAllReduceOpImpl()
221 const bool is_tpu = all_reduce.device_type().endswith("TPU"); in LowerAllReduceOpImpl()
227 if (mlir::failed(EmitAllReduceForXla(context, builder, all_reduce, in LowerAllReduceOpImpl()
[all …]
/external/tensorflow/tensorflow/python/ops/
collective_ops_gpu_test.py
69 collectives.append(collective_ops.all_reduce(
88 collective_ops.all_reduce(
111 collectives.append(collective_ops.all_reduce(
133 collectives.append(collective_ops.all_reduce(
235 collective_ops.all_reduce(in0, self._group_size, group_key,
239 collective_ops.all_reduce(in1, self._group_size, group_key,
255 c0 = collective_ops.all_reduce(
260 c1 = collective_ops.all_reduce(
279 collective_ops.all_reduce(
296 collective_ops.all_reduce(
[all …]
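TF's low-level collective API addresses participants by integer keys rather than group objects: every op sharing (group_size, group_key) belongs to one group, and instance_key ties together the ops of a single reduction. A hedged sketch of the call pattern in these tests (the merge_op/final_op values are assumptions based on the v1 API):

    import tensorflow as tf
    from tensorflow.python.ops import collective_ops

    group_size, group_key, instance_key = 2, 1, 1
    results = []
    for device in ('/GPU:0', '/GPU:1'):   # one participant per device
        with tf.device(device):
            t = tf.constant([1.0, 2.0])
            results.append(collective_ops.all_reduce(
                t, group_size, group_key, instance_key,
                merge_op='Add', final_op='Id'))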
/external/pytorch/test/distributed/tensor/parallel/
test_tp_examples.py
47 reduce_scatter, all_gather, all_reduce = ( variable
50 c10d_functional.all_reduce,
135 self.assertEqual(comm_mode.get_comm_counts()[c10d_functional.all_reduce], 1)
140 dist.all_reduce(model.net1.weight.grad)
141 dist.all_reduce(model.net1.bias.grad)
142 dist.all_reduce(model.net2.weight.grad)
143 dist.all_reduce(model.net2.bias.grad)
286 fwd={all_reduce: 6, all_gather: 1}, bwd={all_reduce: 9}
291 optim={all_reduce: 30},
345 bwd={reduce_scatter: 5, all_gather: 6}, optim={all_reduce: 30}
[all …]
/external/tensorflow/tensorflow/python/kernel_tests/
collective_ops_multi_worker_test.py
109 collective_ops.all_reduce(
182 collective_ops.all_reduce(
200 collective_ops.all_reduce(
210 collective_ops.all_reduce(
222 collective_ops.all_reduce(
253 collective_ops.all_reduce(in_tensor, group_size, group_key,
259 collective_ops.all_reduce(in_tensor, group_size, group_key,
267 collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key)
282 collective_ops.all_reduce(in_tensor, group_size, group_key, instance_key)
collective_ops_test.py
47 def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs): member in CollectiveOpsV1
49 return _collective_ops.all_reduce(t, group_size, group_key, instance_key,
65 def all_reduce(t, group_size, group_key, instance_key, *args, **kwargs): member in CollectiveOpsV2
107 combinations.NamedObject('all_reduce', CollectiveOpsV1.all_reduce),
109 combinations.NamedObject('all_reduce_v2', CollectiveOpsV2.all_reduce),
144 return collective_ops.all_reduce(
161 collective_ops.all_reduce(
170 collective_ops.all_reduce(
329 collective_ops.all_reduce(
338 collective_ops.all_reduce(
[all …]
/external/tensorflow/tensorflow/compiler/mlir/xla/tests/
legalize-tf-collective.mlir
10 // CHECK: "mhlo.all_reduce"
26 // CHECK: "mhlo.all_reduce"
32 // CHECK: "mhlo.all_reduce"
47 // CHECK: "mhlo.all_reduce"
56 // CHECK: "mhlo.all_reduce"
66 // CHECK: %[[REDUCE:.*]] = "mhlo.all_reduce"
77 // CHECK: "mhlo.all_reduce"
86 // CHECK: "mhlo.all_reduce"
103 // CHECK: "mhlo.all_reduce"
109 // CHECK: "mhlo.all_reduce"
[all …]
/external/pytorch/test/distributed/
test_multi_threaded_pg.py
199 dist.all_reduce(output)
220 dist.all_reduce(tensor, op=ReduceOp.PRODUCT)
225 dist.all_reduce(tensor, op=ReduceOp.MIN)
229 dist.all_reduce(tensor, op=ReduceOp.MAX)
233 dist.all_reduce(tensor, op=ReduceOp.BAND)
238 dist.all_reduce(tensor, op=ReduceOp.BOR)
243 dist.all_reduce(tensor, op=ReduceOp.BXOR)
264 dist.all_reduce(output, group=subpg0)
266 dist.all_reduce(output, group=subpg1)
276 dist.all_reduce(x, group=pg)
[all …]
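These tests exercise the in-place eager API across reduction ops and subgroups. A minimal usage sketch, assuming dist.init_process_group has run and the code executes on every rank:

    import torch
    import torch.distributed as dist

    t = torch.ones(4)
    dist.all_reduce(t)                        # defaults to ReduceOp.SUM
    dist.all_reduce(t, op=dist.ReduceOp.MAX)  # elementwise max across ranks

    subgroup = dist.new_group(ranks=[0, 1])   # collective over ranks 0-1 only
    if dist.get_rank() in (0, 1):
        dist.all_reduce(t, group=subgroup)

Note that every rank must call dist.new_group, but only the member ranks pass the subgroup to a collective.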
test_compute_comm_reordering.py
110 ar = _functional_collectives.all_reduce(a, "sum", "0")
152 e = _functional_collectives.all_reduce(b, "sum", "0")
199 e = _functional_collectives.all_reduce(b, "sum", "0")
248 ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
253 fr = _functional_collectives.all_reduce(f, "sum", ranks, tag)
305 ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
310 fr = _functional_collectives.all_reduce(f, "sum", ranks, tag)
355 ar = _functional_collectives.all_reduce(div, "sum", ranks, tag)
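_functional_collectives.all_reduce is the out-of-place variant these compile-time tests rely on: it returns a fresh tensor (wrapped so the first real use waits for the collective), which is what lets inductor trace it and reorder it against compute. A sketch, assuming an initialized process group:

    import torch
    import torch.distributed as dist
    import torch.distributed._functional_collectives as fc

    a = torch.ones(4, 4)
    ranks = list(range(dist.get_world_size()))
    ar = fc.all_reduce(a, "sum", ranks)   # out-of-place; `a` is untouched
    b = ar + 1                            # using the result syncs as needed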
test_inductor_collectives.py
104 ar = torch.ops.c10d_functional.all_reduce(z, "sum", tag, ranks, group_size)
138 dist.all_reduce(y, op=dist.ReduceOp.SUM)
144 y = dist.all_reduce(y, op=dist.ReduceOp.SUM)
167 op = torch.ops._c10d_functional.all_reduce.default
169 op = torch.ops.c10d_functional.all_reduce.default
179 ar = torch.ops.c10d_functional.all_reduce(z, "sum", tag, ranks, group_size)
218 ar = torch.ops.c10d_functional.all_reduce(z, "sum", tag, ranks, group_size)
251 ar = _functional_collectives.all_reduce(a, "sum", ranks, tag)
525 ar = torch.ops.c10d_functional.all_reduce(
560 ar = torch.ops.c10d_functional.all_reduce(x, "sum", tag, ranks, group_size)
[all …]
/external/pytorch/torch/distributed/algorithms/ddp_comm_hooks/
default_hooks.py
27 dist.all_reduce(tensor, group=group_to_use, async_op=True)
91 grad = dist._functional_collectives.all_reduce(
96 fut = dist.all_reduce(
140 grad = dist._functional_collectives.all_reduce(
145 fut = dist.all_reduce(
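A DDP comm hook receives the process group and a GradBucket and returns a future of the reduced flat gradient; the snippet shows both the eager path (async all_reduce plus a future) and the compiled path (functional all_reduce). A sketch of an eager-style allreduce hook, following the public GradBucket/Work APIs:

    import torch
    import torch.distributed as dist

    def allreduce_hook(process_group, bucket):
        group = process_group if process_group is not None else dist.group.WORLD
        tensor = bucket.buffer().div_(group.size())  # pre-divide for the mean
        fut = dist.all_reduce(tensor, group=group, async_op=True).get_future()
        return fut.then(lambda f: f.value()[0])

It would be registered with model.register_comm_hook(None, allreduce_hook) on a DistributedDataParallel model.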
/external/tensorflow/third_party/nccl/
archive.patch
5 diff --git a/src/collectives/device/all_reduce.cu b/src/collectives/device/all_reduce.cu.cc
7 rename from src/collectives/device/all_reduce.cu
8 rename to src/collectives/device/all_reduce.cu.cc
/external/pytorch/torch/distributed/algorithms/
join.py
284 dist.all_reduce(num_nonjoined_procs, group=self._process_group)
293 dist.all_reduce(ones, group=self._process_group)
337 work = dist.all_reduce(ones, group=process_group, async_op=True)
342 dist.all_reduce(zeros, group=process_group)
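Join uses all_reduce as a barrier that also carries data: each rank that has not yet joined contributes a one to a SUM reduction, while joined ranks shadow the collective with zeros, so every rank learns how many peers are still active. The idiom in isolation, with still_working as a hypothetical local flag:

    import torch
    import torch.distributed as dist

    contribution = torch.ones(1) if still_working else torch.zeros(1)
    dist.all_reduce(contribution)        # SUM across the process group
    if int(contribution.item()) == 0:
        pass  # all ranks have joined; stop shadowing collectives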
