/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_types.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/profiler/lib/scoped_annotation.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/platform.h"

namespace xla {
namespace gpu {
namespace {

using tensorflow::profiler::ScopedAnnotation;

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
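//
// The constructor stores the compiled text/binary, thunk schedule, and buffer
// assignment, and registers the module with GpuDebugInfoManager; the
// destructor unregisters it.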
GpuExecutable::GpuExecutable(
    const string& text, const std::vector<uint8>& binary,
    GpuVersion gpu_version, std::unique_ptr<const ThunkSchedule> thunk_schedule,
    std::shared_ptr<HloModule> hlo_module,
    std::shared_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      text_(text),
      binary_(binary),
      gpu_version_(gpu_version),
      thunk_schedule_(std::move(thunk_schedule)),
      assignment_(std::move(assignment)) {
  CHECK(has_module() && assignment_);
  GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
                                             assignment_);
  ComputeThunkAnnotations();
}

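// The destructor mirrors the constructor's registration and, in addition,
// synchronizes every executor that received constant globals before the
// backing HLO module is destroyed (see the comment in the body below).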
GpuExecutable::~GpuExecutable() {
  CHECK(has_module() && assignment_);
  GpuDebugInfoManager::Get()->UnregisterModule(module().name(), shared_module(),
                                               assignment_);

  {
    // We could have issued host->device mem copies in ResolveConstantGlobals.
    // Wait for those to finish so that we can safely deallocate the backing HLO
    // module.
    //
    // We need to wait for the host->device memcpies to finish because they are
    // concurrently reading memory (xla::Literal's) owned by the HLO module.
    tensorflow::mutex_lock lock(module_handle_mutex_);
    for (const auto& pair : module_globals_) {
      CHECK(pair.first->SynchronizeAllActivity());
    }
  }
}

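// Precomputes the ScopedAnnotation string for every thunk in the schedule, of
// the form "<canonical hlo>:#hlo_op=<name>,hlo_module=<module>#"; the result
// is looked up in ExecuteThunks when tracing is enabled.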
void GpuExecutable::ComputeThunkAnnotations() {
  CanonicalNameMap canonical_name_map;
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    const HloInstruction* hlo = thunk->hlo_instruction();
    CHECK(hlo);
    thunk_annotations_[thunk] =
        absl::StrFormat("%s:#hlo_op=%s,hlo_module=%s#",
                        hlo->ToStringWithCanonicalNameMap(
                            HloPrintOptions::Canonical(), &canonical_name_map),
                        hlo->name(), hlo->GetModule()->name());
  }
}

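// Verifies that the GPU this executable was compiled for (gpu_version_)
// matches the device behind the stream we are about to run on: the AMDGPU GCN
// ISA version on ROCm, or the CUDA compute capability on NVIDIA GPUs.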
Status GpuExecutable::CheckCompatibilityWithServiceExecutableRunOptions(
    const ServiceExecutableRunOptions* run_options) {
  se::Stream* main_stream = run_options->stream();

  stream_executor::PlatformKind platform_kind =
      main_stream->parent()->platform_kind();
  if (platform_kind == stream_executor::PlatformKind::kROCm) {
    int stream_isa_version;
    main_stream->parent()->GetDeviceDescription().rocm_amdgpu_isa_version(
        &stream_isa_version);
    GpuVersion amd_isa_version = stream_isa_version;
    TF_RET_CHECK(amd_isa_version == gpu_version_)
        << "AMDGPU GCN ISA version mismatch; expected {"
        << absl::get<int>(gpu_version_) << "}, but was " << stream_isa_version;
  } else if (platform_kind == stream_executor::PlatformKind::kCuda) {
    std::pair<int, int> stream_compute_compatibility;
    main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
        &stream_compute_compatibility.first,
        &stream_compute_compatibility.second);
    GpuVersion nvidia_compute_compatibility = stream_compute_compatibility;
    TF_RET_CHECK(nvidia_compute_compatibility == gpu_version_)
        << "Compute capability mismatch; expected {"
        << absl::get<std::pair<int, int>>(gpu_version_).first << ", "
        << absl::get<std::pair<int, int>>(gpu_version_).second << "}, but was {"
        << stream_compute_compatibility.first << ", "
        << stream_compute_compatibility.second << "}";
  } else {
    return InternalError("Unknown platform: %d", platform_kind);
  }

  return Status::OK();
}

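// Runs all thunks in the schedule's total order. Each thunk executes on its
// assigned stream (stream 0 is `main_stream`), and cross-stream dependencies
// are enforced with se::Event record/wait pairs. The host blocks at the end
// if profiling is enabled or if the caller requested synchronous execution.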
Status GpuExecutable::ExecuteThunks(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done,
    HloExecutionProfile* hlo_execution_profile) {
  TF_RETURN_IF_ERROR(
      CheckCompatibilityWithServiceExecutableRunOptions(run_options));
  GpuDebugInfoManager::Get()->OnModuleStart(module().name());
  auto cleanup = MakeCleanup(
      [&]() { GpuDebugInfoManager::Get()->OnModuleStop(module().name()); });

  se::Stream* main_stream = run_options->stream();
  se::StreamExecutor* executor = main_stream->parent();

  bool do_profile = hlo_execution_profile != nullptr;
  if (do_profile) {
    LOG(WARNING) << "PROFILING: profiling is enabled";
  }

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
  std::vector<StreamPool::Ptr> sub_streams;
  sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
  while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(sub_streams.back(),
                        run_options->BorrowStream(executor->device_ordinal()));
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                sub_streams, hlo_module_->entry_computation());
  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  tensorflow::profiler::TraceMe hlo_module_activity(
      [&] { return absl::StrCat(hlo_module_->name(), ":XLA GPU module"); },
      tensorflow::profiler::TraceMeLevel::kInfo);

  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
  bool scoped_annotation_enabled = ScopedAnnotation::IsEnabled();
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    // Annotate execution of this op if tracing was enabled when we started
    // running this module.  If tracing is enabled *while* we're running the
    // module, we won't get any data, but that's probably an OK trade-off.
    absl::optional<ScopedAnnotation> op_annotation;
    CHECK(thunk->hlo_instruction());
    if (scoped_annotation_enabled) {
      op_annotation.emplace(FindOrDie(thunk_annotations_, thunk));
    }

    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
    int32 stream_no =
        thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    VLOG(2) << "Executing the thunk for "
            << thunk->hlo_instruction()->ToString() << " on stream "
            << stream_no;
    Thunk::ExecuteParams thunk_params{
        &buffer_allocations, stream, run_options->run_options().run_id(),
        &profiler, run_options->run_options().device_assignment()};
    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(thunk_params));
    if (thunk_schedule_->Depended(thunk)) {
      auto finish_event = absl::make_unique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk] = std::move(finish_event);
    }
  }

  main_stream->ThenWaitFor(&sub_streams);
  // Make sure kernels are completed before deallocating temporary buffers or
  // the profiler state.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (do_profile || block_host_until_done) {
    Status block_status = main_stream->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          main_stream, block_status.error_message());
    }
  }

  // FinishExecution() blocks until main_stream has completed if profiling is
  // enabled; we therefore do not need to defer profile collection onto a
  // stream.
  profiler.FinishExecution();
  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->run_options().execution_profile()) {
    ExecutionProfile* profile = run_options->run_options().execution_profile();
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    profile->set_compute_time_ns(std::max(nanoseconds, 1.0));

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
      profile->set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

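// Loads the compiled module onto `stream`'s executor (at most once per
// StreamExecutor) and resolves the device addresses of constant buffer
// allocations. Constants that were not embedded in the generated code are
// copied host->device here; the destructor waits on those copies.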
StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
GpuExecutable::ResolveConstantGlobals(se::Stream* stream) {
  se::StreamExecutor* executor = stream->parent();

  tensorflow::mutex_lock lock(module_handle_mutex_);
  auto it = module_globals_.find(executor);
  if (it != module_globals_.end()) {
    return &it->second;
  }

  se::MultiModuleLoaderSpec module_spec;
  if (!binary().empty()) {
    module_spec.AddCudaCubinInMemory(binary());
  }
  module_spec.AddCudaPtxInMemory(text().c_str());

  absl::flat_hash_map<int64, se::DeviceMemoryBase> globals;
  if (executor->platform_kind() == se::PlatformKind::kCuda &&
      module_spec.cuda_ptx_in_memory() == nullptr) {
    // No custom PTX => no globals.
    return &module_globals_.emplace(executor, std::move(globals)).first->second;
  }

  se::ModuleHandle module_handle;
  TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle));

  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    if (allocation.is_constant()) {
      TF_ASSIGN_OR_RETURN(
          se::DeviceMemoryBase global,
          executor->GetUntypedSymbol(
              llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
              module_handle));
      VLOG(3) << "Resolved global "
              << llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
              << " to " << global.opaque();
      InsertOrDie(&globals, i, global);

      const Literal& literal =
          llvm_ir::LiteralForConstantAllocation(allocation);
      CHECK(literal.shape().IsArray());
      if (!ShouldEmitLiteralInLlvmIr(literal)) {
        VLOG(3) << "H2D memcpy for constant with shape "
                << ShapeUtil::HumanString(literal.shape());
        stream->ThenMemcpy(&global, literal.untyped_data(), allocation.size());
      }
    }
  }

  module_handles_.emplace(executor,
                          se::ScopedModuleHandle(executor, module_handle));
  return &module_globals_.emplace(executor, std::move(globals)).first->second;
}

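// Top-level entry point for running this executable: registers parameter and
// constant buffers, builds the BufferAllocations, runs the thunks, and then
// assembles the result ScopedShapedBuffer from the root instruction's buffers,
// honoring any input/output aliasing declared in the module.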
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ShapeTree<MaybeOwningDeviceMemory>> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  se::DeviceMemoryAllocator* const memory_allocator = run_options->allocator();
  // Force synchronous execution if the allocator requires it.
  const bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();

  if (GetRootValueSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  BufferAllocations::Builder buffer_allocations_builder;
  const GpuExecutable::BufferAllocToDeviceMemoryMap* globals;
  {
    tensorflow::profiler::TraceMe hlo_module_activity(
        [&] { return std::string("Resolve constant globals"); },
        tensorflow::profiler::TraceMeLevel::kInfo);

    TF_ASSIGN_OR_RETURN(globals, ResolveConstantGlobals(run_options->stream()));
  }

  se::StreamExecutor* executor = run_options->stream()->parent();

  std::unique_ptr<BufferAllocations> buffer_allocations;

  {
    tensorflow::profiler::TraceMe hlo_module_activity(
        [&] { return std::string("Build buffer allocations"); },
        tensorflow::profiler::TraceMeLevel::kInfo);

    for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
         ++i) {
      const BufferAllocation& allocation = assignment_->GetAllocation(i);
      if (allocation.is_entry_computation_parameter()) {
        auto param_no = allocation.parameter_number();
        se::DeviceMemoryBase buffer =
            arguments[param_no]
                .element(allocation.param_shape_index())
                .AsDeviceMemoryBase();

        // All top-level buffers and sub-buffers must have an explicit, non-null
        // pointer, except for zero-sized buffers, which may be null.
        if (buffer.is_null() && buffer.size() > 0) {
          return FailedPrecondition(
              "Cannot run XLA computation because pointer to (sub-)buffer at "
              "index %s of parameter %d was null.  All pointers to "
              "(sub-)buffers must not be null, unless the (sub-)buffer has "
              "zero elements.",
              allocation.param_shape_index().ToString(), param_no);
        }

        buffer_allocations_builder.RegisterBuffer(i, buffer);
      }

      if (allocation.is_constant()) {
        buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i));
      }
    }

    TF_ASSIGN_OR_RETURN(
        buffer_allocations,
        buffer_allocations_builder.Build(
            assignment_.get(), executor->device_ordinal(), memory_allocator));
  }

  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  auto device_ordinal = executor->device_ordinal();
  ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(),
                                   memory_allocator, device_ordinal);

  // Copy DeviceMemoryBase values which contain the array(s) of the result into
  // the respective location in ShapedBuffer.
  std::set<se::DeviceMemoryBase> buffers_in_result;
  TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus(
      [&buffer_allocations, &buffers_in_result, this](
          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
        const auto& sources = this->GetRootValueSet().element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.values().size());
        auto src_hlo = sources.values()[0]->instruction();

        VLOG(4) << "Looking at: " << sources.values()[0];

        // The source instruction should have a non-parameter buffer
        // assigned.
        TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice slice,
                            this->assignment_->GetUniqueSlice(
                                src_hlo, sources.values()[0]->index()));

        se::DeviceMemoryBase src_base =
            buffer_allocations->GetDeviceAddress(slice.index());
        CHECK(!src_base.is_null() || src_base.size() == 0);
        if (!slice.allocation()->is_entry_computation_parameter()) {
          // If the buffer coming out of the result is from a parameter, it
          // means the caller aliased some parameter buffer to an output one
          // (via the HloInputOutputAliasConfig API). If that is the case, the
          // caller will receive a partially complete scoped shaped buffer,
          // which they will have to fill up on return.
          // Unfortunately the interface to the execute APIs is ShapedBuffer
          // pointer based, which assumes caller ownership, and hence a buffer
          // coming from there cannot be part of the new ScopedShapedBuffer we
          // create for the result (which assumes ownership).
          *device_memory = src_base;
        } else {
          const HloInputOutputAliasConfig& input_output_alias =
              module().input_output_alias_config();
          auto output_alias = input_output_alias.GetAliasedOutput(
              slice.allocation()->parameter_number(),
              slice.allocation()->param_shape_index());
          CHECK(output_alias)
              << "Output buffer is coming from parameter "
              << slice.allocation()->parameter_number() << " at index "
              << slice.allocation()->param_shape_index()
              << ", but no alias exists";
          CHECK_EQ(*output_alias, index);
        }
        buffers_in_result.insert(src_base);
        return Status::OK();
      }));
  TF_RETURN_IF_ERROR(buffer_allocations->TearDown(buffers_in_result));

  std::vector<se::OwningDeviceMemory> buffers_to_free;
  for (ShapeTree<MaybeOwningDeviceMemory>& argument : arguments) {
    for (std::pair<ShapeIndex, MaybeOwningDeviceMemory>& buffer : argument) {
      auto maybe_owning_buffer = buffer.second.Release();
      if (maybe_owning_buffer) {
        buffers_to_free.push_back(std::move(*maybe_owning_buffer));
      }
    }
  }
  return ExecutionOutput(std::move(shaped_buffer), std::move(buffers_to_free),
                         {}, {});
}

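// Returns the dataflow value set of the entry computation's root instruction,
// which maps each index of the output shape to the HLO value(s) that define
// it.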
const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
  return assignment_->dataflow_analysis().GetInstructionValueSet(
      module().entry_computation()->root_instruction());
}

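// Reports the size of the compiled GPU binary (cubin). A return value of -1
// means the size is unknown, e.g. when only PTX was produced.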
int64 GpuExecutable::SizeOfGeneratedCodeInBytes() {
  // Non-empty PTX but empty cubin: compilation must have failed, return
  // "unknown".
  if (binary().empty() && !text_.empty()) {
    return -1;
  }
  return binary().size();
}

}  // namespace gpu
}  // namespace xla