/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"

#include <stdint.h>

#include <algorithm>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/xla_debug_info_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/host/host_stream.h"

namespace xla {
namespace cpu {

CpuExecutable::CpuExecutable(
    std::unique_ptr<SimpleOrcJIT> jit,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      jit_(std::move(jit)),
      assignment_(std::move(assignment)),
      module_name_(entry_function_name) {
  if (assignment_) {
    buffer_assignment_.reset(new BufferAssignmentProto(assignment_->ToProto()));
  }
  XlaDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
                                             buffer_assignment_);

  // Resolve symbols in the constructor rather than at execution time to avoid
  // races because FindSymbol is not thread safe.
  llvm::Expected<llvm::JITEvaluatedSymbol> sym =
      jit_->FindCompiledSymbol(entry_function_name);
  // We expect to find the symbol provided with entry_function_name; otherwise
  // this is an internal error.
  CHECK(*sym) << "Symbol " << entry_function_name << " not found.";
  // getAddress can do work under the hood in the jit, so it needs to be
  // guarded by the mutex.
  compute_function_ = reinterpret_cast<ComputeFunctionType>(sym->getAddress());
  VLOG(1) << "compute_function_ at address "
          << reinterpret_cast<void*>(compute_function_);
  jit_->DoneCompiling();
}

CpuExecutable::~CpuExecutable() {
  XlaDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
                                               buffer_assignment_);
}

static StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
    const BufferAllocation& allocation,
    absl::Span<ExecutionInput const> arguments,
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal) {
  VLOG(3) << allocation.ToString();
  if (allocation.is_entry_computation_parameter()) {
    se::DeviceMemoryBase out = arguments[allocation.parameter_number()]
                                   .Buffer(allocation.param_shape_index())
                                   .AsDeviceMemoryBase();
    CHECK_EQ(allocation.size(), out.size())
        << "Size mismatch on param " << allocation.parameter_number()
        << " at shape index " << allocation.param_shape_index().ToString();
    VLOG(3) << "allocation is a parameter";
    return MaybeOwningDeviceMemory{out};
  } else if (allocation.is_constant()) {
    VLOG(3) << "allocation is a constant";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  } else if (allocation.is_thread_local()) {
    VLOG(3) << "buffer is thread-local";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  }

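  // All other allocations (temporary buffers and, typically, the output) are
  // backed by fresh device memory that this execution owns.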
  int64_t buffer_size = allocation.size();
  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory out,
                      memory_allocator->Allocate(device_ordinal, buffer_size));
  VLOG(3) << "buffer allocated " << buffer_size << " bytes [" << out->opaque()
          << "]";

  // Since the output buffer and all the temporary buffers will be written
  // into by the JITed code, msan has no way of knowing their memory was
  // initialized. Mark them initialized so that msan doesn't flag loads from
  // these buffers.
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(out->opaque(), buffer_size);
  return MaybeOwningDeviceMemory{std::move(out)};
}

StatusOr<std::vector<MaybeOwningDeviceMemory>> CpuExecutable::CreateBufferTable(
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
    absl::Span<ExecutionInput const> arguments) {
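  // buffers[i] holds the device memory backing BufferAllocation i; the JITed
  // compute function later receives this table and indexes into it directly.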
  std::vector<MaybeOwningDeviceMemory> buffers(
      assignment_->Allocations().size());
  VLOG(3) << "Allocating " << assignment_->Allocations().size()
          << " allocations for module " << module().name();
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    TF_ASSIGN_OR_RETURN(
        buffers[i], MemoryForAllocation(allocation, arguments, memory_allocator,
                                        device_ordinal));
  }

  if (VLOG_IS_ON(3)) {
    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                        assignment_->GetUniqueTopLevelOutputSlice());
    VLOG(3) << "result index: " << result_slice.index();
  }
  return std::move(buffers);
}

Status CpuExecutable::ExecuteComputeFunction(
    const ExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory const> buffers,
    HloExecutionProfile* hlo_execution_profile) {
  // The calling convention for JITed functions is:
  //
  //  void function(void* result, const void* run_options, void** args_array,
  //                void** buffer_table, int64* profile_counters)
  //
  // result: Points at the result.
  // run_options: the ExecutableRunOptions object.
  // args_array: null
  // buffer_table: An array of pointers, containing pointers to temporary
  //   buffers required by the executable and pointers to entry computation
  //   parameters.
  // profile_counters: An array of profile counters used when HLO profiling
  //   is enabled, or null when profiling is disabled.
  //

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  XlaDebugInfoManager::Get()->OnModuleStart(module_name_);
  auto cleanup = MakeCleanup(
      [&]() { XlaDebugInfoManager::Get()->OnModuleStop(module_name_); });

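  // When HLO profiling is enabled, the JITed code records per-instruction
  // counters into this array; otherwise a null pointer is passed instead.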
  size_t profile_counters_size =
      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
                            : 0;
  int64* profile_counters =
      hlo_execution_profile
          ? hlo_execution_profile->mutable_profile_counters()->data()
          : nullptr;

  // Call the computation function following the calling convention.
  std::vector<void*> buffer_pointers;
  for (auto& buffer : buffers) {
    buffer_pointers.push_back(
        const_cast<void*>(buffer.AsDeviceMemoryBase().opaque()));
  }
  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                      assignment_->GetUniqueTopLevelOutputSlice());
  void* result_buffer = buffer_pointers[result_slice.index()];
  if (VLOG_IS_ON(3)) {
    VLOG(3) << "Executing compute function:";
    VLOG(3) << absl::StrFormat(
        "  func(void* result, void* params[null], void* buffer_table[%u], "
        "uint64 profile_counters[%u])",
        buffer_pointers.size(), profile_counters_size);
    VLOG(3) << absl::StrFormat("    result = %p", result_buffer);
    auto ptr_printer = [](string* out, const void* p) {
      absl::StrAppend(out, absl::StrFormat("%p", p));
    };
    VLOG(3) << "    params = nullptr";
    VLOG(3) << absl::StrFormat(
        "    buffer_table = [%s]",
        absl::StrJoin(buffer_pointers, ", ", ptr_printer));
    VLOG(3) << absl::StrFormat("    profile_counters = %p", profile_counters);
  }

  compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(),
                    profile_counters);

  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->execution_profile()) {
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    run_options->execution_profile()->set_compute_time_ns(
        std::max(nanoseconds, 1.0));
    // If hlo profiling was disabled then the cycle count is left empty.
    if (hlo_execution_profile) {
      run_options->execution_profile()->set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory> buffers,
    absl::Span<ExecutionInput> arguments) {
  se::Stream* stream = run_options->stream();
  ExecutionOutput result(/*on_device_shape=*/result_shape(),
                         run_options->allocator(),
                         stream->parent()->device_ordinal());
  const HloInputOutputAliasConfig& input_output_alias =
      module().input_output_alias_config();
  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  const Shape& root_shape = root->shape();

  // Move se::OwningDeviceMemory values which contain the array(s) of the result
  // into the respective location in the ScopedShapedBuffer which is returned to
  // the caller.
  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    se::DeviceMemoryBase& result_buffer = p.second;
    const HloValueSet& sources = this->GetRootValueSet().element(index);
    // The points-to set is unambiguous, so the set should be a
    // singleton.
    CHECK_EQ(1, sources.values().size());
    const HloValue* value_source = sources.values()[0];
    HloInstruction* src = value_source->instruction();

    // The source for this result buffer can be a nested buffer such as
    // a tuple element.
    TF_ASSIGN_OR_RETURN(
        const BufferAllocation::Slice slice,
        this->assignment_->GetUniqueSlice(src, value_source->index()));
    const BufferAllocation::Index buffer_index = slice.index();

    // TODO(cheshire): duplication with other backends.
    absl::optional<HloInputOutputAliasConfig::Alias> alias =
        input_output_alias.GetAliasedParameter(index);
    if (alias) {
      CHECK_LT(alias->parameter_number, arguments.size());
      ExecutionInput& input = arguments[alias->parameter_number];
      MaybeOwningDeviceMemory* maybe_owning_memory =
          input.MutableBuffer(alias->parameter_index);
      if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: %s",
            alias->ToString());
      }
      if (absl::optional<se::OwningDeviceMemory> owning =
              maybe_owning_memory->Release()) {
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the input buffer, but if the execute call
        // fails we should not release it, since it still contains valid data
        // (for example, it may be a parameter which the user wants us to
        // alias in a gradient update computation). So we store the index in
        // the result's aliased-index vector; if the ExecutionOutput is not
        // committed, it uses these indices to drop the addresses from its
        // own ScopedShapedBuffer result.
        result.AddAliasedIndex(index);
      } else {
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64_t allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(root_shape, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            run_options->allocator()->Allocate(
                stream->parent()->device_ordinal(), allocation_size));
        result_buffer = allocated_buffer.Release();
        MaybeOwningDeviceMemory& registered_buffer = buffers[buffer_index];
        CHECK_EQ(result_buffer.size(),
                 registered_buffer.AsDeviceMemoryBase().size());
        std::memcpy(/*dest=*/result_buffer.opaque(),
                    /*src=*/registered_buffer.AsDeviceMemoryBase().opaque(),
                    /*n=*/result_buffer.size());
        registered_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      MaybeOwningDeviceMemory& buffer = buffers[buffer_index];
      if (absl::optional<se::OwningDeviceMemory> owned_buffer =
              buffer.Release()) {
        result_buffer = owned_buffer->Release();
        buffer = result_buffer;
      } else {
        result_buffer = buffer.AsDeviceMemoryBase();
        result.AddAliasedIndex(index);
      }
    }
  }
  return std::move(result);
}

StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  if (GetRootValueSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  if (hlo_module_) {
    const HloComputation* entry_comp = hlo_module_->entry_computation();
    CHECK_EQ(entry_comp->num_parameters(), arguments.size())
        << "Wrong number of arguments passed when running executable";
    for (int64_t i = 0; i < entry_comp->num_parameters(); ++i) {
      const Shape& expected_shape =
          entry_comp->parameter_instruction(i)->shape();
      const Shape& actual_shape = arguments[i].Buffers().shape();
      TF_RET_CHECK(
          ShapeUtil::DynamicShapeIsCompatible(actual_shape, expected_shape))
          << "Shape mismatch on argument " << i << ", "
          << expected_shape.ToString(/*print_layout=*/true) << " vs. "
          << actual_shape.ToString(/*print_layout=*/true);
    }
  }

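  // On the CPU backend the stream is expected to be backed by a
  // se::host::HostStream; the compute function is enqueued on it below rather
  // than run inline, so this call may return before the computation finishes.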
  auto* host_stream = dynamic_cast<se::host::HostStream*>(
      run_options->stream()->implementation());
  se::Stream* stream = run_options->stream();
  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
  TF_ASSIGN_OR_RETURN(
      std::vector<MaybeOwningDeviceMemory> buffers,
      CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(),
                        arguments));

  TF_ASSIGN_OR_RETURN(
      ExecutionOutput result,
      CreateResultShapedBuffer(run_options, absl::MakeSpan(buffers),
                               absl::MakeSpan(arguments)));

  // Logically we want this lambda to capture `buffers` by move, but ultimately
  // the functor needs to be wrapped in an std::function, and that requires the
  // functor to be copyable.  Thus we perpetrate the hack of capturing buffers
  // "by shared pointer".
  //
  // We also need to change the types of some of the variables we capture:
  // run_options needs to change from a pointer to a value type.  We use a
  // struct instead of a lambda to make this explicit.
  struct AsyncRunTask {
    CpuExecutable* executable;
    ServiceExecutableRunOptions run_options;
    std::shared_ptr<std::vector<MaybeOwningDeviceMemory>> task_buffers;
    HloExecutionProfile* hlo_execution_profile;

    void operator()() {
      // Failing a CHECK here is not great, but I don't see an obvious way to
      // return a failed Status asynchronously.
      TF_CHECK_OK(executable->ExecuteComputeFunction(
          &run_options.run_options(), *task_buffers, hlo_execution_profile));
    }
  };
  host_stream->EnqueueTask(
      AsyncRunTask{this, *run_options,
                   std::make_shared<std::vector<MaybeOwningDeviceMemory>>(
                       std::move(buffers)),
                   hlo_execution_profile});

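  // Any donated argument buffers that were not reused for the result above
  // are handed over to the ExecutionOutput so they still get released.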
  MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
  return std::move(result);
}

/*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
  // On the CPU, opaques are pointers.
  if (shape.IsOpaque()) {
    return sizeof(void*);
  }
  if (shape.is_static() || shape.IsTuple()) {
    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
  }
  // Each dynamic dimension size is represented as an S32.
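  // For example, a dynamic f32[<=4] shape takes ByteSizeOf = 16 bytes for the
  // data plus 1 * sizeof(int32) = 4 bytes of dimension-size metadata.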
  int64_t metadata_size = sizeof(int32) * shape.dimensions_size();
  return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size;
}

const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
  return assignment_->dataflow_analysis().GetInstructionValueSet(
      module().entry_computation()->root_instruction());
}

int64 CpuExecutable::SizeOfGeneratedCodeInBytes() const {
  return jit_->SizeOfGeneratedCodeInBytes();
}

}  // namespace cpu
}  // namespace xla