/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"

namespace xla {
namespace gpu {
namespace {

using tensorflow::tracing::ScopedAnnotation;

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(
    const string& ptx, const std::vector<uint8>& cubin,
    std::pair<int, int> compute_capability,
    std::unique_ptr<const ThunkSchedule> thunk_schedule,
    std::unique_ptr<HloModule> hlo_module,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      ptx_(ptx),
      cubin_(cubin),
      compute_capability_(compute_capability),
      thunk_schedule_(std::move(thunk_schedule)),
      assignment_(std::move(assignment)) {}

Status GpuExecutable::ExecuteThunks(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done,
    HloExecutionProfile* hlo_execution_profile) {
  se::Stream* main_stream = run_options->stream();
  se::StreamExecutor* executor = main_stream->parent();

  std::pair<int, int> stream_compute_compatibility;
  executor->GetDeviceDescription().cuda_compute_capability(
      &stream_compute_compatibility.first,
      &stream_compute_compatibility.second);
  TF_RET_CHECK(stream_compute_compatibility == compute_capability_)
      << "Compute capability mismatch; expected {" << compute_capability_.first
      << ", " << compute_capability_.second << "}, but was {"
      << stream_compute_compatibility.first << ", "
      << stream_compute_compatibility.second << "}";

  bool do_profile = hlo_execution_profile != nullptr;
  if (do_profile) {
    LOG(WARNING) << "PROFILING: profiling is enabled";
  }

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
  std::vector<StreamPool::Ptr> sub_streams;
  sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
  while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(sub_streams.back(),
                        run_options->BorrowStream(executor->device_ordinal()));
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                sub_streams, hlo_module_->entry_computation());
  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  // This top-level trace serves two purposes:
  // 1) It marks the scope of the whole XLA module.
  // 2) It tells us whether tracing is enabled. We use this to avoid the
  //    expensive HloInstruction::ToString() calls inside the loop below if
  //    tracing is disabled.
  ScopedAnnotation top_level_annotation(hlo_module_->name(), "XLA GPU module");

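  // Events recorded when each thunk finishes, keyed by thunk. Thunks running
  // on other streams wait on these events to enforce the dependencies in the
  // thunk schedule (see the DependsOn() loop below).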
  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    // Annotate execution of this op if tracing was enabled when we started
    // running this module. If tracing is enabled *while* we're running the
    // module, we won't get any data, but that's probably an OK trade-off.
    //
    // TODO(jlebar): Should we cache the results of HloInstruction::ToString(),
    // since we expect it to be an expensive call?
    absl::optional<ScopedAnnotation> op_annotation;
    if (top_level_annotation.IsEnabled()) {
      op_annotation.emplace(
          thunk->hlo_instruction() != nullptr
              ? thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical())
              : "<unknown>",
          "XLA op");
    }

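    // Initialize() performs any one-time, per-executor setup the thunk needs
    // (e.g. loading kernels) before it can be enqueued on a stream.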
    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
    int32 stream_no =
        thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    // If this thunk is about to autotune then wait for all currently executing
    // thunks to finish. This reduces noise and thus the probability of
    // choosing a suboptimal algorithm.
    if (thunk->WillAutotuneKernel(stream)) {
      TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
    }

    VLOG(2) << "Executing the thunk for "
            << thunk->hlo_instruction()->ToString() << " on stream "
            << stream_no;
    TF_RETURN_IF_ERROR(
        thunk->ExecuteOnStream(buffer_allocations, stream, &profiler));
    if (thunk_schedule_->Depended(thunk)) {
      auto finish_event = absl::make_unique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk] = std::move(finish_event);
    }
  }

  main_stream->ThenWaitFor(&sub_streams);
  // Make sure kernels are completed before deallocating temporary buffers.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (block_host_until_done) {
    Status block_status = main_stream->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          main_stream, block_status.error_message());
    }
  }

  profiler.FinishExecution();
  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  {
    tensorflow::mutex_lock lock(mutex_);
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
      execution_profile_.set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
  tensorflow::mutex_lock lock(module_handle_mutex_);
  auto it = module_globals_.find(executor);
  if (it != module_globals_.end()) {
    return &it->second;
  }

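  // Not cached yet for this executor: load the compiled module (the cubin
  // when present, plus the PTX) onto the device so its globals can be
  // resolved by symbol name below.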
  se::MultiModuleLoaderSpec module_spec;
  if (!cubin().empty()) {
    module_spec.AddCudaCubinInMemory(cubin());
  }
  module_spec.AddCudaPtxInMemory(ptx().c_str());

  absl::flat_hash_map<int64, se::DeviceMemoryBase> globals;
  se::ModuleHandle module_handle;
  executor->LoadModule(module_spec, &module_handle);

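  // For each constant buffer allocation, look up the corresponding global in
  // the loaded module. Constants that were not emitted directly into the LLVM
  // IR are initialized here with a host-to-device copy of their literal.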
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    if (allocation.is_constant()) {
      TF_ASSIGN_OR_RETURN(
          se::DeviceMemoryBase global,
          executor->GetUntypedSymbol(
              llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
              module_handle));
      VLOG(3) << "Resolved global "
              << llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
              << " to " << global.opaque();
      InsertOrDie(&globals, i, global);

      const Literal& literal =
          llvm_ir::LiteralForConstantAllocation(allocation);
      CHECK(literal.shape().IsArray());
      if (!ShouldEmitLiteralInLlvmIr(literal)) {
        VLOG(3) << "H2D memcpy for constant with shape "
                << ShapeUtil::HumanString(literal.shape());
        TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D(
            literal.untyped_data(), allocation.size(), &global));
      }
    }
  }

  module_handles_.emplace(executor,
                          se::ScopedModuleHandle(executor, module_handle));
  return &module_globals_.emplace(executor, std::move(globals)).first->second;
}

StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<const ShapedBuffer* const> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  DeviceMemoryAllocator* memory_allocator = run_options->allocator();

  if (GetRootPointsToSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  BufferAllocations::Builder buffer_allocations_builder;
  se::StreamExecutor* executor = run_options->stream()->parent();

  TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor));

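  // Register the buffers whose addresses are already known: entry-computation
  // parameters come from the caller-provided arguments, and constants come
  // from the globals resolved above. The remaining allocations are created by
  // the builder's Build() call below.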
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    if (allocation.is_entry_computation_parameter()) {
      auto param_no = allocation.parameter_number();
      se::DeviceMemoryBase buffer =
          arguments[param_no]->buffer(allocation.param_shape_index());

      // All top-level buffers and sub-buffers must have an explicit, non-null
      // pointer, except for zero-sized buffers, which may be null.
      if (buffer.is_null() && buffer.size() > 0) {
        return FailedPrecondition(
            "Cannot run XLA computation because pointer to (sub-)buffer at "
            "index %s of parameter %d was null. All pointers to (sub-)buffers "
            "must not be null, unless the (sub-)buffer has zero elements.",
            allocation.param_shape_index().ToString(), param_no);
      }

      buffer_allocations_builder.RegisterBuffer(i, buffer);
    }

    if (allocation.is_constant()) {
      buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i));
    }
  }

  TF_ASSIGN_OR_RETURN(
      auto buffer_allocations,
      buffer_allocations_builder.Build(
          assignment_.get(), executor->device_ordinal(), memory_allocator));

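  // If the allocator does not allow asynchronous deallocation, block the host
  // until all launched kernels have finished so the temporary buffers are not
  // reused while still in use (they are torn down at the end of this method).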
  bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();
  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  auto device_ordinal = executor->device_ordinal();
  ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(),
                                   memory_allocator, device_ordinal);

  // Copy DeviceMemoryBase values which contain the array(s) of the result into
  // the respective location in ShapedBuffer.
  std::set<se::DeviceMemoryBase> buffers_in_result;
  TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus(
      [&buffer_allocations, &buffers_in_result, this](
          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
        const auto& sources = this->GetRootPointsToSet().element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.size());
        auto src_hlo = sources[0]->instruction();

        VLOG(4) << "Looking at: " << sources[0];

        // The source instruction should have a non-parameter buffer
        // assigned.
        TF_ASSIGN_OR_RETURN(
            const BufferAllocation::Slice slice,
            this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));

        se::DeviceMemoryBase src_base =
            buffer_allocations->GetDeviceAddress(slice.index());
        CHECK(!src_base.is_null() || src_base.size() == 0);
        if (!slice.allocation()->is_entry_computation_parameter()) {
          // If the buffer coming out of the result is from a parameter, it
          // means the caller aliased some parameter buffer to an output one
          // (via the HloInputOutputAliasConfig API). If that is the case, the
          // caller will receive a partially complete scoped shaped buffer,
          // which they will have to fill up on return.
          // Unfortunately the interface to the execute APIs is ShapedBuffer
          // pointer based, which assumes caller ownership, and hence a buffer
          // coming from there cannot be part of the new ScopedShapedBuffer we
          // create for the result (which assumes ownership).
          *device_memory = src_base;
        } else {
          const HloInputOutputAliasConfig& input_output_alias =
              module().input_output_alias_config();
          auto output_alias = input_output_alias.GetAliasedOutput(
              slice.allocation()->parameter_number(),
              slice.allocation()->param_shape_index());
          CHECK(output_alias)
              << "Output buffer is coming from parameter "
              << slice.allocation()->parameter_number() << " at index "
              << slice.allocation()->param_shape_index()
              << ", but no alias exists";
          CHECK_EQ(*output_alias, index);
        }
        buffers_in_result.insert(src_base);
        return Status::OK();
      }));
  TF_RETURN_IF_ERROR(buffer_allocations->TearDown(buffers_in_result));

  return std::move(shaped_buffer);
}

StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<const ShapedBuffer* const> arguments) {
  // TODO(b/30671675): Implement asynchronous execution mode.
  return Unimplemented(
      "Asynchronous execution on stream is not yet supported on GPU.");
}

const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
  return assignment_->points_to_analysis().GetPointsToSet(
      module().entry_computation()->root_instruction());
}

}  // namespace gpu
}  // namespace xla