/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"

#include <stdint.h>

#include <algorithm>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/xla_debug_info_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/host/host_stream.h"

namespace xla {
namespace cpu {

CpuExecutable::CpuExecutable(
    std::unique_ptr<SimpleOrcJIT> jit,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      jit_(std::move(jit)),
      assignment_(std::move(assignment)),
      module_name_(entry_function_name) {
  if (assignment_) {
    buffer_assignment_.reset(new BufferAssignmentProto(assignment_->ToProto()));
  }
  XlaDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
                                             buffer_assignment_);

  // Resolve symbols in the constructor rather than at execution time to avoid
  // races because FindSymbol is not thread safe.
  llvm::Expected<llvm::JITEvaluatedSymbol> sym =
      jit_->FindCompiledSymbol(entry_function_name);
  // We expect to find the symbol provided with entry_function_name; otherwise
  // this is an internal error.
  CHECK(*sym) << "Symbol " << entry_function_name << " not found.";
  // getAddress can do work under the hood in the jit, so it needs to be
  // guarded by the mutex.
  compute_function_ = reinterpret_cast<ComputeFunctionType>(sym->getAddress());
  VLOG(1) << "compute_function_ at address "
          << reinterpret_cast<void*>(compute_function_);
}

CpuExecutable::~CpuExecutable() {
  XlaDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
                                               buffer_assignment_);
}

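// Resolves the backing memory for one BufferAllocation. Entry computation
// parameters are serviced directly from the caller-provided `arguments`;
// constant and thread-local allocations resolve to a null DeviceMemoryBase,
// since they are not backed by the buffer table; everything else gets a
// fresh allocation from `memory_allocator`.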
static StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
    const BufferAllocation& allocation,
    absl::Span<ExecutionInput const> arguments,
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal) {
  VLOG(3) << allocation.ToString();
  if (allocation.is_entry_computation_parameter()) {
    se::DeviceMemoryBase out = arguments[allocation.parameter_number()]
                                   .Buffer(allocation.param_shape_index())
                                   .AsDeviceMemoryBase();
    CHECK_EQ(allocation.size(), out.size())
        << "Size mismatch on param " << allocation.parameter_number()
        << " at shape index " << allocation.param_shape_index().ToString();
    VLOG(3) << "allocation is a parameter";
    return MaybeOwningDeviceMemory{out};
  } else if (allocation.is_constant()) {
    VLOG(3) << "allocation is a constant";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  } else if (allocation.is_thread_local()) {
    VLOG(3) << "buffer is thread-local";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  }

  int64 buffer_size = allocation.size();
  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory out,
                      memory_allocator->Allocate(device_ordinal, buffer_size));
  VLOG(3) << "buffer allocated " << buffer_size << " bytes [" << out->opaque()
          << "]";

  // Since the output buffer and all the temporary buffers were written into
  // by the JITed code, msan has no way of knowing their memory was
  // initialized. Mark them initialized so that msan doesn't flag loads from
  // these buffers.
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(out->opaque(), buffer_size);
  return MaybeOwningDeviceMemory{std::move(out)};
}

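// Builds the buffer table consumed by the JITed entry function: one
// MaybeOwningDeviceMemory per BufferAllocation in the BufferAssignment,
// indexed by allocation index.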
StatusOr<std::vector<MaybeOwningDeviceMemory>> CpuExecutable::CreateBufferTable(
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
    absl::Span<ExecutionInput const> arguments) {
  std::vector<MaybeOwningDeviceMemory> buffers(
      assignment_->Allocations().size());
  VLOG(3) << "Allocating " << assignment_->Allocations().size()
          << " allocations for module " << module().name();
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    TF_ASSIGN_OR_RETURN(
        buffers[i], MemoryForAllocation(allocation, arguments, memory_allocator,
                                        device_ordinal));
  }

  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                      assignment_->GetUniqueTopLevelOutputSlice());
  VLOG(3) << "result index: " << result_slice.index();
  return std::move(buffers);
}

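// Calls into the JIT-compiled entry function with the prepared buffer table
// and, when HLO profiling is enabled, the per-instruction cycle counters;
// also records wall-clock compute time in the ExecutionProfile.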
Status CpuExecutable::ExecuteComputeFunction(
    const ExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory const> buffers,
    HloExecutionProfile* hlo_execution_profile) {
  // The calling convention for JITed functions is:
  //
  //  void function(void* result, const void* run_options, void** args_array,
  //                void** buffer_table, int64* profile_counters)
  //
  // result: Points at the result.
  // run_options: the ExecutableRunOptions object.
  // args_array: null
  // buffer_table: An array of pointers, containing pointers to temporary
  //   buffers required by the executable and pointers to entry computation
  //   parameters.
  // profile_counters: An array of cycle counters, one per profiled HLO
  //   instruction, or null when HLO profiling is disabled.
  //

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  XlaDebugInfoManager::Get()->OnModuleStart(module_name_);
  auto cleanup = MakeCleanup(
      [&]() { XlaDebugInfoManager::Get()->OnModuleStop(module_name_); });

  size_t profile_counters_size =
      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
                            : 0;
  int64* profile_counters =
      hlo_execution_profile
          ? hlo_execution_profile->mutable_profile_counters()->data()
          : nullptr;

  // Call the computation function following the calling convention.
  std::vector<void*> buffer_pointers;
  for (auto& buffer : buffers) {
    buffer_pointers.push_back(
        const_cast<void*>(buffer.AsDeviceMemoryBase().opaque()));
  }
  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                      assignment_->GetUniqueTopLevelOutputSlice());
  void* result_buffer = buffer_pointers[result_slice.index()];
  if (VLOG_IS_ON(3)) {
    VLOG(3) << "Executing compute function:";
    VLOG(3) << absl::StrFormat(
        "  func(void* result, void* params[null], void* buffer_table[%u], "
        "uint64 profile_counters[%u])",
        buffer_pointers.size(), profile_counters_size);
    VLOG(3) << absl::StrFormat("    result = %p", result_buffer);
    auto ptr_printer = [](string* out, const void* p) {
      absl::StrAppend(out, absl::StrFormat("%p", p));
    };
    VLOG(3) << "    params = nullptr";
    VLOG(3) << absl::StrFormat(
        "    buffer_table = [%s]",
        absl::StrJoin(buffer_pointers, ", ", ptr_printer));
    VLOG(3) << absl::StrFormat("    profile_counters = %p", profile_counters);
  }

  compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(),
                    profile_counters);

  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->execution_profile()) {
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    run_options->execution_profile()->set_compute_time_ns(
        std::max(nanoseconds, 1.0));
    // If hlo profiling was disabled then the cycle count is left empty.
    if (hlo_execution_profile) {
      run_options->execution_profile()->set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

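// Assembles the ExecutionOutput returned to the caller. For each leaf of the
// result shape, it either reuses a donated input buffer (per the module's
// input/output alias config), copies out of the buffer table when aliasing
// was requested but the input was not donated, or transfers ownership of the
// corresponding buffer-table entry.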
StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory> buffers,
    absl::Span<ExecutionInput> arguments) {
  se::Stream* stream = run_options->stream();
  ExecutionOutput result(/*on_device_shape=*/result_shape(),
                         run_options->allocator(),
                         stream->parent()->device_ordinal());
  const HloInputOutputAliasConfig& input_output_alias =
      module().input_output_alias_config();
  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  const Shape& root_shape = root->shape();

  // Move se::OwningDeviceMemory values which contain the array(s) of the result
  // into the respective location in ScopedShapedBuffer which is returned to the
  // caller.
  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    se::DeviceMemoryBase& result_buffer = p.second;
    const HloValueSet& sources = this->GetRootValueSet().element(index);
    // The points-to set is unambiguous, so the set should be a singleton.
    CHECK_EQ(1, sources.values().size());
    const HloValue* value_source = sources.values()[0];
    HloInstruction* src = value_source->instruction();

    // The source for this result buffer can be a nested buffer such as
    // a tuple element.
    TF_ASSIGN_OR_RETURN(
        const BufferAllocation::Slice slice,
        this->assignment_->GetUniqueSlice(src, value_source->index()));
    const BufferAllocation::Index buffer_index = slice.index();

    // TODO(cheshire): duplication with other backends.
    absl::optional<HloInputOutputAliasConfig::Alias> alias =
        input_output_alias.GetAliasedParameter(index);
    if (alias) {
      CHECK_LT(alias->parameter_number, arguments.size());
      ExecutionInput& input = arguments[alias->parameter_number];
      MaybeOwningDeviceMemory* maybe_owning_memory =
          input.MutableBuffer(alias->parameter_index);
      if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: %s",
            alias->ToString());
      }
      if (absl::optional<se::OwningDeviceMemory> owning =
              maybe_owning_memory->Release()) {
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the input buffer, but if the execute call
        // fails we should not release it, as it contains valid data (for
        // example, it may be a parameter the user wants us to alias in a
        // gradient-update computation). So we store the index into the
        // aliased vector, which is fed to the ExecutionOutput; the
        // ExecutionOutput uses those indices to drop the addresses from its
        // own ScopedShapedBuffer result if it is not committed.
        result.AddAliasedIndex(index);
      } else {
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64 allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(root_shape, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            run_options->allocator()->Allocate(
                stream->parent()->device_ordinal(), allocation_size));
        result_buffer = allocated_buffer.Release();
        MaybeOwningDeviceMemory& registered_buffer = buffers[buffer_index];
        CHECK_EQ(result_buffer.size(),
                 registered_buffer.AsDeviceMemoryBase().size());
        std::memcpy(/*dest=*/result_buffer.opaque(),
                    /*src=*/registered_buffer.AsDeviceMemoryBase().opaque(),
                    /*n=*/result_buffer.size());
        registered_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      MaybeOwningDeviceMemory& buffer = buffers[buffer_index];
      if (absl::optional<se::OwningDeviceMemory> owned_buffer =
              buffer.Release()) {
        result_buffer = owned_buffer->Release();
        buffer = result_buffer;
      } else {
        result_buffer = buffer.AsDeviceMemoryBase();
        result.AddAliasedIndex(index);
      }
    }
  }
  return std::move(result);
}

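// Entry point for execution: validates argument shapes, builds the buffer
// table, assembles the result, and enqueues the actual compute on the host
// stream so it runs asynchronously with respect to the caller.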
StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  if (GetRootValueSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  if (hlo_module_) {
    const HloComputation* entry_comp = hlo_module_->entry_computation();
    CHECK_EQ(entry_comp->num_parameters(), arguments.size())
        << "Wrong number of arguments passed when running executable";
    for (int64 i = 0; i < entry_comp->num_parameters(); ++i) {
      const Shape& expected_shape =
          entry_comp->parameter_instruction(i)->shape();
      const Shape& actual_shape = arguments[i].Buffers().shape();
      TF_RET_CHECK(
          ShapeUtil::DynamicShapeIsCompatible(actual_shape, expected_shape))
          << "Shape mismatch on argument " << i << ", "
          << expected_shape.ToString(/*print_layout=*/true) << " vs. "
          << actual_shape.ToString(/*print_layout=*/true);
    }
  }

  auto* host_stream = dynamic_cast<se::host::HostStream*>(
      run_options->stream()->implementation());
  se::Stream* stream = run_options->stream();
  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
  TF_ASSIGN_OR_RETURN(
      std::vector<MaybeOwningDeviceMemory> buffers,
      CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(),
                        arguments));

  TF_ASSIGN_OR_RETURN(
      ExecutionOutput result,
      CreateResultShapedBuffer(run_options, absl::MakeSpan(buffers),
                               absl::MakeSpan(arguments)));

  // Logically we want this lambda to capture `buffers` by move, but our
  // functor ultimately needs to be wrapped in an std::function, and that
  // requires the functor to be copyable.  Thus we perpetrate the hack of
  // capturing buffers "by shared pointer".
  //
  // We also need to change the types of some of the variables we capture:
  // run_options needs to change from a pointer to a value type, and arguments
  // needs to change from a Span into a vector.  We use a struct instead
  // of a lambda to make this explicit.
  struct AsyncRunTask {
    CpuExecutable* executable;
    ServiceExecutableRunOptions run_options;
    std::shared_ptr<std::vector<MaybeOwningDeviceMemory>> task_buffers;
    HloExecutionProfile* hlo_execution_profile;

    void operator()() {
      // Failing a CHECK here is not great, but I don't see an obvious way to
      // return a failed Status asynchronously.
      TF_CHECK_OK(executable->ExecuteComputeFunction(
          &run_options.run_options(), *task_buffers, hlo_execution_profile));
    }
  };
  host_stream->EnqueueTask(
      AsyncRunTask{this, *run_options,
                   std::make_shared<std::vector<MaybeOwningDeviceMemory>>(
                       std::move(buffers)),
                   hlo_execution_profile});

  MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
  return std::move(result);
}

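// Returns the number of bytes the CPU runtime uses to store a value of
// `shape`: opaques are raw pointers, and dynamic shapes carry extra metadata
// recording the runtime size of each dynamic dimension.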
/*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
  // On the cpu, opaques are pointers.
  if (shape.IsOpaque()) {
    return sizeof(void*);
  }
  if (shape.is_static() || shape.IsTuple()) {
    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
  }
  // Each dynamic dimension size is represented as an S32.
  int64 metadata_size = sizeof(int32) * shape.dimensions_size();
  return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size;
}

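// Returns the set of HLO values that may back the root instruction's output,
// as computed by the buffer assignment's dataflow analysis.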
const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
  return assignment_->dataflow_analysis().GetInstructionValueSet(
      module().entry_computation()->root_instruction());
}

int64 CpuExecutable::SizeOfGeneratedCodeInBytes() const {
  return jit_->SizeOfGeneratedCodeInBytes();
}

}  // namespace cpu
}  // namespace xla