/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/service/xla_debug_info_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/lib/scoped_annotation.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/platform.h"

namespace xla {
namespace gpu {
namespace {

using ::tensorflow::profiler::ScopedAnnotation;

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(GpuExecutable::Params params)
    : Executable(std::move(params.debug_module),
                 std::move(params.hlo_profile_printer_data),
                 std::move(params.hlo_profile_index_map)),
      text_(std::move(params.asm_text)),
      binary_(std::move(params.binary)),
      gpu_version_(params.gpu_version),
      thunk_schedule_(std::move(params.thunk_schedule)),
      module_name_(params.module_name),
      output_shape_(params.output_shape),
      allocations_(std::move(params.allocations)),
      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
      entry_computation_profile_index_(params.entry_computation_profile_index),
      constants_(std::move(params.constants)),
      output_info_(std::move(params.output_info)) {
  XlaDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
                                             debug_buffer_assignment_);
}

GpuExecutable::~GpuExecutable() {
  XlaDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
                                               debug_buffer_assignment_);

  {
    // We could have issued host->device mem copies in ResolveConstantGlobals.
    // Wait for those to finish so that we can safely deallocate the backing
    // HLO module.
    //
    // We need the host->device memcpies to finish because they are
    // concurrently reading memory (xla::Literal's) owned by the HLO module.
    tensorflow::mutex_lock lock(module_handle_mutex_);
    for (const auto& pair : module_globals_) {
      CHECK(pair.first->SynchronizeAllActivity());
    }
  }
}

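// Verifies that the GPU backing `run_options->stream()` matches the GPU this
// executable was compiled for: the AMDGPU GCN ISA version on ROCm, or the CUDA
// compute capability on NVIDIA devices. Returns an error on a mismatch or an
// unknown platform.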
Status GpuExecutable::CheckCompatibilityWithServiceExecutableRunOptions(
    const ServiceExecutableRunOptions* run_options) {
  se::Stream* main_stream = run_options->stream();

  stream_executor::PlatformKind platform_kind =
      main_stream->parent()->platform_kind();
  if (platform_kind == stream_executor::PlatformKind::kROCm) {
    int stream_isa_version;
    main_stream->parent()->GetDeviceDescription().rocm_amdgpu_isa_version(
        &stream_isa_version);
    int gpu_exec_isa_version =
        absl::get<std::pair<int, std::string>>(gpu_version_).first;
    TF_RET_CHECK(stream_isa_version == gpu_exec_isa_version)
        << "AMDGPU GCN ISA version mismatch; expected " << gpu_exec_isa_version
        << ", but was " << stream_isa_version;
  } else if (platform_kind == stream_executor::PlatformKind::kCuda) {
    std::pair<int, int> stream_compute_compatibility;
    main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
        &stream_compute_compatibility.first,
        &stream_compute_compatibility.second);
    GpuVersion nvidia_compute_compatibility = stream_compute_compatibility;
    TF_RET_CHECK(nvidia_compute_compatibility == gpu_version_)
        << "Compute capability mismatch; expected {"
        << absl::get<std::pair<int, int>>(gpu_version_).first << ", "
        << absl::get<std::pair<int, int>>(gpu_version_).second << "}, but was {"
        << stream_compute_compatibility.first << ", "
        << stream_compute_compatibility.second << "}";
  } else {
    return InternalError("Unknown platform: %d", platform_kind);
  }

  return Status::OK();
}

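// Runs every thunk in `thunk_schedule_`'s total order, dispatching each one to
// the main stream or to a borrowed substream. Cross-stream dependencies are
// enforced with se::Event records/waits. Host callbacks deferred by thunks run
// after all substreams rejoin the main stream, and the host blocks at the end
// when profiling is enabled or `block_host_until_done` is set.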
Status GpuExecutable::ExecuteThunks(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done,
    HloExecutionProfile* hlo_execution_profile) {
  TF_RETURN_IF_ERROR(
      CheckCompatibilityWithServiceExecutableRunOptions(run_options));
  XlaDebugInfoManager::Get()->OnModuleStart(module_name_);
  auto cleanup = MakeCleanup(
      [&]() { XlaDebugInfoManager::Get()->OnModuleStop(module_name_); });

  se::Stream* main_stream = run_options->stream();
  se::StreamExecutor* executor = main_stream->parent();

  bool do_profile = hlo_execution_profile != nullptr;
  if (do_profile) {
    LOG(WARNING) << "PROFILING: profiling is enabled";
  }

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
  std::vector<StreamPool::Ptr> sub_streams;
  sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
  while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(sub_streams.back(),
                        run_options->BorrowStream(executor->device_ordinal()));
    // Require substreams to wait for the main stream, otherwise substreams may
    // execute before the program is scheduled to start on the main stream.
    sub_streams.back()->ThenWaitFor(main_stream);
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                sub_streams, entry_computation_profile_index_);
  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return absl::StrCat(module_name_, ":XLA GPU module"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
  std::vector<std::function<void()>> deferred_host_callbacks;
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    // Annotate execution of this op if tracing was enabled when we started
    // running this module. If tracing is enabled *while* we're running the
    // module, we won't get any data, but that's probably an OK trade-off.
    ScopedAnnotation annotation([&] { return thunk->profile_annotation(); });

    int32 stream_no = thunk_schedule_->StreamNumberForThunk(thunk);
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    VLOG(2) << "Executing the thunk for " << thunk->profile_annotation()
            << " on stream " << stream_no;
    const GpuExecutableRunOptions* gpu_options =
        run_options->run_options().gpu_executable_run_options();
    Thunk::ExecuteParams thunk_params{
        &buffer_allocations,
        stream,
        run_options->run_options().run_id(),
        &profiler,
        run_options->run_options().device_assignment(),
        &deferred_host_callbacks,
        gpu_options && gpu_options->gpu_global_device_ids()
            ? &*gpu_options->gpu_global_device_ids()
            : nullptr,
        gpu_options && gpu_options->nccl_unique_id_callback()
            ? &gpu_options->nccl_unique_id_callback()
            : nullptr};
    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(thunk_params));
    if (thunk_schedule_->Depended(thunk)) {
      auto finish_event = absl::make_unique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk] = std::move(finish_event);
    }
  }

  main_stream->ThenWaitFor(&sub_streams);
  if (!deferred_host_callbacks.empty()) {
    auto fn = [deferred_host_callbacks{std::move(deferred_host_callbacks)}]() {
      for (auto& callback : deferred_host_callbacks) {
        callback();
      }
    };
    if (run_options->run_options().then_execute_function()) {
      (*run_options->run_options().then_execute_function())(main_stream,
                                                            std::move(fn));
    } else {
      main_stream->ThenDoHostCallback(std::move(fn));
    }
  }
  // Make sure kernels are completed before deallocating temporary buffers or
  // the profiler state.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (do_profile || block_host_until_done) {
    Status block_status = main_stream->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          main_stream, block_status.error_message());
    }
  }

  // FinishExecution() blocks until main_stream has completed if profiling is
  // enabled; we therefore do not need to defer profile collection onto a
  // stream.
  profiler.FinishExecution();
  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->run_options().execution_profile()) {
    ExecutionProfile* profile = run_options->run_options().execution_profile();
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    profile->set_compute_time_ns(std::max(nanoseconds, 1.0));

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
      profile->set_compute_cycle_count(hlo_execution_profile->GetCyclesTakenBy(
          entry_computation_profile_index_));
    }
  }

  return Status::OK();
}

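// Loads this executable's compiled module (cubin and/or PTX) onto the device
// behind `stream` if it has not been loaded there yet, copies each constant's
// contents into its device global, and caches a map from buffer allocation
// index to the resolved device memory. Subsequent calls for the same executor
// return the cached map.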
StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
  se::StreamExecutor* executor = stream->parent();

  tensorflow::mutex_lock lock(module_handle_mutex_);
  auto it = module_globals_.find(executor);
  if (it != module_globals_.end()) {
    return &it->second;
  }

  se::MultiModuleLoaderSpec module_spec;
  if (!binary().empty()) {
    module_spec.AddCudaCubinInMemory(binary());
  }
  module_spec.AddCudaPtxInMemory(text().c_str());

  absl::flat_hash_map<int64, se::DeviceMemoryBase> globals;
  if (executor->platform_kind() == se::PlatformKind::kCuda &&
      module_spec.cuda_ptx_in_memory() == nullptr) {
    // No custom PTX => no globals.
    return &module_globals_.emplace(executor, std::move(globals))
                .first->second;
  }

  se::ModuleHandle module_handle;
  TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));

  for (const auto& info : constants_) {
    TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol(
                                         info.symbol_name, module_handle));
    VLOG(3) << "Resolved global " << info.symbol_name << " to "
            << global.opaque();

    if (!info.content.empty()) {
      stream->ThenMemcpy(&global, info.content.data(), info.content.size());
    }

    if (info.allocation_index != -1) {
      InsertOrDie(&globals, info.allocation_index, global);
    }
  }

  module_handles_.emplace(executor,
                          se::ScopedModuleHandle(executor, module_handle));
  return &module_globals_.emplace(executor, std::move(globals)).first->second;
}

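// Resolves a single BufferAllocation to the device memory that backs it: a
// null buffer for thread-local allocations, the caller-provided argument
// buffer for entry computation parameters, the resolved global for constants,
// or a freshly allocated buffer for temp/maybe-live-out allocations.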
StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
    VariantArguments arguments,
    const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
    const BufferAllocation& allocation,
    se::DeviceMemoryAllocator* const memory_allocator, int device_ordinal,
    int64 arg_idx) {
  if (allocation.is_thread_local()) {
    return se::DeviceMemoryBase{};
  } else if (allocation.is_entry_computation_parameter()) {
    int64 param_no = allocation.parameter_number();
    se::DeviceMemoryBase registered_buffer = [&] {
      if (auto unowned_shapedbuffers =
              absl::get_if<absl::Span<const ShapedBuffer* const>>(&arguments)) {
        return (*unowned_shapedbuffers)[param_no]->buffers().element(
            allocation.param_shape_index());
      } else {
        return absl::get<absl::Span<ExecutionInput>>(arguments)[param_no]
            .Buffer(allocation.param_shape_index())
            .AsDeviceMemoryBase();
      }
    }();
    if (registered_buffer.is_null() && registered_buffer.size() > 0) {
      return FailedPrecondition(
          "Cannot run XLA computation because pointer to (sub-)buffer at "
          "index %s of parameter %d was null. All pointers to "
          "(sub-)buffers must not be null, unless the (sub-)buffer has "
          "zero elements.",
          allocation.param_shape_index().ToString(), param_no);
    }
    return registered_buffer;
  } else if (allocation.is_constant()) {
    auto it = globals->find(arg_idx);
    if (it == globals->end()) {
      return se::DeviceMemoryBase();
    }
    return it->second;
  } else {
    // Allocate each allocation that might escape, or is the temp buffer.
    CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer());
    const int64 buffer_size = allocation.size();
    se::DeviceMemoryBase buffer_address;
    if (buffer_size > 0) {
      TF_ASSIGN_OR_RETURN(
          se::OwningDeviceMemory buffer,
          memory_allocator->Allocate(device_ordinal, buffer_size));
      buffer_address = buffer.Release();
    }
    return buffer_address;
  }
}

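// Returns an error if `buffer` does not honor the alignment required for its
// allocation kind: entry parameters, constants, and XLA-allocated scratch
// buffers each have their own minimum alignment.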
static Status CheckAlignment(const BufferAllocation& allocation,
                             se::DeviceMemoryBase buffer, int arg_idx) {
  const int64 expected_alignment = [&] {
    if (allocation.is_entry_computation_parameter()) {
      return kEntryParameterAlignBytes;
    } else if (allocation.is_constant()) {
      return kConstantBufferAlignBytes;
    } else {
      return kXlaAllocatedBufferAlignBytes;
    }
  }();
  if (!buffer.is_null() &&
      reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
    return InternalError(
        "Address of buffer %d must be a multiple of %x, but "
        "was %p",
        arg_idx, expected_alignment, buffer.opaque());
  }
  return Status::OK();
}

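// Builds the BufferAllocations table for one execution by resolving every
// BufferAllocation to device memory and verifying its alignment.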
StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
    VariantArguments arguments,
    const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
    se::DeviceMemoryAllocator* const memory_allocator,
    se::StreamExecutor* executor) {
  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return std::string("Build buffer allocations"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  const int64 num_buffers = allocations_.size();
  std::vector<se::DeviceMemoryBase> buffers;
  buffers.reserve(num_buffers);
  for (int64 i = 0; i < num_buffers; ++i) {
    const BufferAllocation& allocation = allocations_[i];
    TF_ASSIGN_OR_RETURN(
        se::DeviceMemoryBase buffer,
        BufferForAllocation(arguments, globals, allocation, memory_allocator,
                            executor->device_ordinal(), i));
    buffers.push_back(buffer);
    TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
  }
  return {{buffers, executor->device_ordinal(), memory_allocator}};
}

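// Entry point for arguments whose buffers may be donated to the computation
// (ExecutionInput); forwards to ExecuteAsyncOnStreamImpl.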
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments),
                                  hlo_execution_profile);
}

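// Entry point for unowned (ShapedBuffer) arguments; forwards to
// ExecuteAsyncOnStreamImpl and returns only the result buffers.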
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<const ShapedBuffer* const> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  TF_ASSIGN_OR_RETURN(
      ExecutionOutput out,
      ExecuteAsyncOnStreamImpl(run_options, arguments, hlo_execution_profile));
  return out.ConsumeResult();
}

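// Shared implementation of both ExecuteAsyncOnStream overloads: resolves
// constant globals, materializes the buffer table, wires up input/output
// aliasing (either reusing donated input buffers or copy-protecting
// non-donated ones), runs the thunk schedule, and finally frees temporary
// buffers and any donated arguments.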
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStreamImpl(
    const ServiceExecutableRunOptions* run_options, VariantArguments arguments,
    HloExecutionProfile* hlo_execution_profile) {
  XLA_SCOPED_LOGGING_TIMER(absl::StrCat(
      "GpuExecutable::ExecuteAsyncOnStreamImpl(", module_name_, ")"));
  se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
  // Force synchronous execution if the allocator requires it.
  const bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();

  const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
  {
    tensorflow::profiler::TraceMe hlo_module_activity(
        [&] { return std::string("Resolve constant globals"); },
        tensorflow::profiler::TraceMeLevel::kInfo);

    TF_ASSIGN_OR_RETURN(globals, ResolveConstantGlobals(run_options->stream()));
  }

  se::StreamExecutor* executor = run_options->stream()->parent();

  auto device_ordinal = executor->device_ordinal();
  ExecutionOutput result(/*on_device_shape=*/output_shape_, memory_allocator,
                         device_ordinal);

  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
                      GenerateBufferAllocations(arguments, globals,
                                                memory_allocator, executor));
  VLOG(2) << buffer_allocations.ToString();
  std::set<se::DeviceMemoryBase> buffers_in_result;

  const bool is_entire_tuple_contents_aliased = [&] {
    for (auto& p : result.MutableResult()->buffers().leaves()) {
      const OutputInfo& output_info = output_info_.at(p.first);
      if (!output_info.alias_config.has_value()) {
        return false;
      }
    }
    return true;
  }();

  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    if (!output_info_.contains(index)) {
      continue;
    }
    const OutputInfo& output_info = output_info_.at(index);
    const BufferAllocation* allocation =
        &allocations_[output_info.allocation_index];
    se::DeviceMemoryBase& result_buffer = p.second;

    VLOG(4) << "Looking at: allocation " << output_info.allocation_index
            << " @ index: " << index.ToString();

    if (output_info.alias_config) {
      MaybeOwningDeviceMemory* maybe_owning_memory =
          [&]() -> xla::MaybeOwningDeviceMemory* {
        // ShapedBuffer arguments are never owned, so there is no buffer whose
        // ownership could be transferred.
        if (auto* unowned_shapedbuffers =
                absl::get_if<absl::Span<const ShapedBuffer* const>>(
                    &arguments)) {
          return nullptr;
        } else {
          auto unowned_execution_input =
              absl::get<absl::Span<ExecutionInput>>(arguments);
          ExecutionInput& input =
              unowned_execution_input[allocation->parameter_number()];
          return input.MutableBuffer(allocation->param_shape_index());
        }
      }();
      if (output_info.alias_config->must_alias() && maybe_owning_memory &&
          !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: allocation %d",
            output_info.allocation_index);
      }
      if (maybe_owning_memory && maybe_owning_memory->HasOwnership()) {
        absl::optional<tensorflow::se::OwningDeviceMemory> owning =
            maybe_owning_memory->Release();
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the input buffer, but in case of an error
        // from the execute call, we should not be releasing it, as it contains
        // valid data (for example, it is a parameter which the user wants us
        // to alias, in a gradient-update computation). So we store the index
        // into the result in the aliased vector, which will be fed to the
        // ExecutionOutput, which will use the indices to drop the addresses
        // from its own ScopedShapedBuffer result, if the ExecutionOutput is
        // not committed.
        result.AddAliasedIndex(index);
      } else if (!output_info.passthrough) {
        // The guard above avoids inserting copy-protection when aliasing
        // pass-through parameters, as we do not need to write into the output
        // buffer in that case.
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64 allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(output_shape_, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            memory_allocator->Allocate(device_ordinal, allocation_size));
        result_buffer = allocated_buffer.Release();
        se::DeviceMemoryBase& aliased_buffer =
            buffer_allocations.GetMutableDeviceAddress(
                output_info.allocation_index);
        CHECK_EQ(aliased_buffer.size(), result_buffer.size());
        run_options->stream()->ThenMemcpyD2D(&result_buffer, aliased_buffer,
                                             aliased_buffer.size());
        aliased_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      // The source instruction should have a non-parameter buffer
      // assigned.
      result_buffer =
          buffer_allocations.GetDeviceAddress(output_info.allocation_index);

      // If the entire tuple contents is aliased, the copy insertion will *not*
      // materialize a new tuple, so we mark it as aliased as well.
      if (is_entire_tuple_contents_aliased) {
        result.AddAliasedIndex(index);
      }
    }
    buffers_in_result.insert(result_buffer);
  }

  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
  }
  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

  // Free all temporary allocations.
  TF_RETURN_IF_ERROR(
      buffer_allocations.TearDown(buffers_in_result, allocations_));

  // Free allocations for arguments.
  if (auto args = absl::get_if<absl::Span<ExecutionInput>>(&arguments)) {
    MarkToBeReleasedArguments(*args, result);
  }
  return std::move(result);
}

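// Reports the size of the generated device code: the compiled binary plus all
// constant allocations. Returns -1 ("unknown") when only PTX is available,
// since that indicates compilation to a cubin did not succeed.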
int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
  // Non-empty PTX but empty cubin: compilation must have failed, return
  // "unknown".
  if (binary().empty() && !text_.empty()) {
    return -1;
  }
  int64 size = binary().size();
  for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
    const BufferAllocation& allocation = allocations_[i];
    if (allocation.is_constant()) {
      size += allocation.size();
    }
  }
  return size;
}

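// Computes, for each leaf of the entry computation's output shape, which
// buffer allocation backs it, whether it is a pass-through parameter, and any
// input/output alias configured for it. Relies on the buffer assignment's
// dataflow analysis being unambiguous at the root instruction.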
StatusOr<absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>>
GetOutputInfo(const HloModule& hlo_module, const BufferAssignment& assignment) {
  const HloInstruction* root =
      hlo_module.entry_computation()->root_instruction();

  InstructionValueSet root_value_set =
      assignment.dataflow_analysis().GetInstructionValueSet(root);

  if (root_value_set.IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  using OutputInfoMap =
      absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
  OutputInfoMap output;
  TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus(
      root->shape(),
      [&](const Shape& /*sub_shape*/, const ShapeIndex& index) -> Status {
        const auto& sources = root_value_set.element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.values().size());
        HloInstruction* src_hlo = sources.values()[0]->instruction();

        GpuExecutable::OutputInfo& info = output[index];
        info.passthrough = src_hlo->opcode() == HloOpcode::kParameter;
        TF_ASSIGN_OR_RETURN(
            const BufferAllocation::Slice slice,
            assignment.GetUniqueSlice(src_hlo, sources.values()[0]->index()));
        CHECK_EQ(slice.offset(), 0) << "Parameter should get its own slice";
        info.allocation_index = slice.index();

        output[index].alias_config =
            hlo_module.input_output_alias_config().GetAliasedParameter(index);

        return Status::OK();
      }));
  return output;
}

}  // namespace gpu
}  // namespace xla