/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
16 #include "tensorflow/compiler/xla/service/llvm_ir/ops.h"
17 #include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
18 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
19 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
20 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
21 #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
22
23 namespace xla {
24 namespace llvm_ir {
25
CanUpdateDynamicSliceInPlace(HloInstruction * dynamic_update_slice,const BufferAssignment & assignment)26 bool CanUpdateDynamicSliceInPlace(HloInstruction* dynamic_update_slice,
27 const BufferAssignment& assignment) {
28 CHECK_EQ(HloOpcode::kDynamicUpdateSlice, dynamic_update_slice->opcode());
29 const HloInstruction* operand = dynamic_update_slice->operand(0);
30 return assignment.HasTopLevelAllocation(dynamic_update_slice) &&
31 assignment.HasTopLevelAllocation(operand) &&
32 assignment.SharesTopLevelSlice(dynamic_update_slice, operand);
33 }
34
// Shared implementation of EmitDynamicUpdateSliceInPlace and
// EmitFusedDynamicUpdateSliceInPlace.
//
// Emits a loop over update_shape that writes each element produced by
// update_array_generator into output_array at an offset given by the
// (dynamic) start indices produced by start_indices_generator.  Using
// generators for both inputs lets this serve materialized arrays and values
// computed inside a fusion alike.
//
// Emits a parallel GPU loop when launch_dimensions is non-null, and a
// sequential loop if launch_dimensions is null.
static Status EmitDynamicUpdateSliceInPlaceImpl(
    const Shape& update_shape, const ElementGenerator& start_indices_generator,
    ElementGenerator update_array_generator, const IrArray& output_array,
    const gpu::LaunchDimensions* launch_dimensions,
    tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) {
  const Shape& output_shape = output_array.GetShape();

  // Read start indices from start_indices_generator.  The start index for
  // dimension i is obtained by evaluating the generator at the 1-D index {i}.
  const int64 rank = ShapeUtil::Rank(output_shape);
  IrArray::Index start_index(rank);
  for (int64 i = 0; i < rank; ++i) {
    IrArray::Index dim_index({ir_builder->getInt64(i)});
    TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
  }

  auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
    // Calculate output_index, where we'll write the value from update. For
    // each dimension,
    //
    // output_index[dim] = (start_index[dim] + update_index[dim]) % dim_size.
    //
    // The unsigned remainder keeps the computed index inside [0, dim_size),
    // i.e. the write position wraps around rather than running past the end
    // of the output buffer.
    IrArray::Index output_index(rank);
    for (int64 i = 0; i < rank; ++i) {
      llvm::Value* dim_size = llvm::ConstantInt::get(
          update_index[i]->getType(), output_shape.dimensions(i));
      // Zero-extend (or bitcast) the start index to the induction variable's
      // integer type so the add's operands have matching widths.
      llvm::Value* start_index0 = ir_builder->CreateZExtOrBitCast(
          start_index[i], update_index[i]->getType());
      output_index[i] = ir_builder->CreateURem(
          ir_builder->CreateAdd(start_index0, update_index[i]), dim_size);
    }

    // Do output[output_index] = update[update_index].
    TF_ASSIGN_OR_RETURN(llvm::Value * update_data,
                        update_array_generator(update_index));
    output_array.EmitWriteArrayElement(output_index, update_data, ir_builder);
    return Status::OK();
  };

  // GPU callers supply launch dimensions so the loop over update_shape is
  // emitted as a parallel kernel loop; otherwise emit a plain serial loop.
  if (launch_dimensions != nullptr) {
    return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape,
                                    *launch_dimensions, ir_builder)
        .EmitLoop(name);
  }
  return LoopEmitter(loop_body_emitter, update_shape, ir_builder)
      .EmitLoop(name);
}
85
EmitDynamicUpdateSliceInPlace(tensorflow::gtl::ArraySlice<IrArray> operand_arrays,const IrArray & output_array,tensorflow::StringPiece name,llvm::IRBuilder<> * ir_builder)86 Status EmitDynamicUpdateSliceInPlace(
87 tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
88 const IrArray& output_array, tensorflow::StringPiece name,
89 llvm::IRBuilder<>* ir_builder) {
90 VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name;
91
92 // No need to use operand_arrays[0], the input array of the
93 // dynamic-update-slice, because we know it aliases the op's output.
94 IrArray update_array = operand_arrays[1];
95 IrArray start_indices_array = operand_arrays[2];
96 Shape output_shape = output_array.GetShape();
97 Shape update_shape = update_array.GetShape();
98
99 ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
100 return start_indices_array.EmitReadArrayElement(index, ir_builder);
101 };
102 ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
103 return update_array.EmitReadArrayElement(index, ir_builder);
104 };
105
106 return EmitDynamicUpdateSliceInPlaceImpl(
107 update_shape, start_indices_generator, update_array_generator,
108 output_array, /*launch_dimensions=*/nullptr, name, ir_builder);
109 }
110
// Shared implementation for EmitFusedDynamicUpdateSliceInPlace and
// EmitParallelFusedDynamicUpdateSliceInPlace.
//
// The fusion's root must be a dynamic-update-slice; its update value and
// start indices are read through FusedIrEmitter generators so they are
// computed on the fly rather than materialized.
//
// Emits a sequential loop if launch_dimensions is null.
static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
    HloInstruction* fusion,
    tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
    const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
    const gpu::LaunchDimensions* launch_dimensions,
    llvm::IRBuilder<>* ir_builder) {
  CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
  VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for "
          << fusion->ToShortString();

  auto* dynamic_update_slice = fusion->fused_expression_root();

  // Operand 1 is the update array and operand 2 the start indices.  Operand 0
  // (the array being updated) is never read here — in-place emission assumes
  // it aliases the fusion output, mirroring EmitDynamicUpdateSliceInPlace.
  const auto* update = dynamic_update_slice->operand(1);
  const auto* start_indices = dynamic_update_slice->operand(2);
  Shape update_shape = update->shape();

  // Our in-place dynamic-update-slice implementation emits a loop over
  // update_shape. To emit a cache-friendly loop, we need to know that shape's
  // layout.
  //
  // update_shape is inside a fusion node -- it's never materialized in memory
  // and thus doesn't have a layout. In this case we use the layout of the
  // fusion node for iteration, since that corresponds to the order in memory of
  // the buffer we'll be writing to.
  //
  // (This isn't necessarily optimal; in some cases it might be faster to peek
  // through the chain of ops that gives us the update operand and use the
  // layout of its source buffer(s). But this is no worse than we do with
  // fusion elsewhere.)
  TF_RETURN_IF_ERROR(
      LayoutUtil::CopyLayoutBetweenShapes(fusion->shape(), &update_shape));

  // Create element generators for update and start_indices.  The fused root
  // is Accept()ed first so the emitter has visited every instruction in the
  // fused computation before we ask it for generators.
  FusedIrEmitter fused_emitter(fusion_operand_arrays, elemental_emitter);
  TF_RETURN_IF_ERROR(dynamic_update_slice->Accept(&fused_emitter));
  ElementGenerator update_array_generator = fused_emitter.GetGenerator(update);
  ElementGenerator start_indices_generator =
      fused_emitter.GetGenerator(start_indices);

  return EmitDynamicUpdateSliceInPlaceImpl(
      update_shape, start_indices_generator, update_array_generator,
      fusion_output_array, launch_dimensions, IrName(fusion), ir_builder);
}
158
EmitFusedDynamicUpdateSliceInPlace(HloInstruction * fusion,tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,const IrArray & fusion_output_array,ElementalIrEmitter * elemental_emitter,llvm::IRBuilder<> * ir_builder)159 Status EmitFusedDynamicUpdateSliceInPlace(
160 HloInstruction* fusion,
161 tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
162 const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
163 llvm::IRBuilder<>* ir_builder) {
164 return EmitFusedDynamicUpdateSliceInPlaceImpl(
165 fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
166 /*launch_dimensions=*/nullptr, ir_builder);
167 }
168
EmitParallelFusedDynamicUpdateSliceInPlace(HloInstruction * fusion,tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,const IrArray & fusion_output_array,ElementalIrEmitter * elemental_emitter,const gpu::LaunchDimensions & launch_dimensions,llvm::IRBuilder<> * ir_builder)169 Status EmitParallelFusedDynamicUpdateSliceInPlace(
170 HloInstruction* fusion,
171 tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
172 const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
173 const gpu::LaunchDimensions& launch_dimensions,
174 llvm::IRBuilder<>* ir_builder) {
175 return EmitFusedDynamicUpdateSliceInPlaceImpl(
176 fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
177 &launch_dimensions, ir_builder);
178 }
179
180 } // namespace llvm_ir
181 } // namespace xla
182