/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"

#include <stdint.h>

#include <algorithm>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/xla_debug_info_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/host/host_stream.h"

namespace xla {
namespace cpu {

CpuExecutable::CpuExecutable(
    std::unique_ptr<SimpleOrcJIT> jit,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      jit_(std::move(jit)),
      assignment_(std::move(assignment)),
      module_name_(entry_function_name) {
  if (assignment_) {
    buffer_assignment_.reset(new BufferAssignmentProto(assignment_->ToProto()));
  }
  XlaDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
                                             buffer_assignment_);

  // Resolve symbols in the constructor rather than at execution time to avoid
  // races because FindSymbol is not thread safe.
  llvm::Expected<llvm::JITEvaluatedSymbol> sym =
      jit_->FindCompiledSymbol(entry_function_name);
  // We expect to find the symbol provided with entry_function_name; otherwise
  // this is an internal error.
  CHECK(*sym) << "Symbol " << entry_function_name << " not found.";
  // getAddress can do work under the hood in the jit, so it needs to be
  // guarded by the mutex.
  compute_function_ = reinterpret_cast<ComputeFunctionType>(sym->getAddress());
  VLOG(1) << "compute_function_ at address "
          << reinterpret_cast<void*>(compute_function_);
  jit_->DoneCompiling();
}

CpuExecutable::~CpuExecutable() {
  XlaDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
                                               buffer_assignment_);
}

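// Returns the backing memory for `allocation`: entry computation parameters
// resolve to the corresponding argument buffer, constants and thread-local
// allocations get a null placeholder (they are handled by the JITed code
// itself), and everything else is freshly allocated from `memory_allocator`
// on `device_ordinal`.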
static StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
    const BufferAllocation& allocation,
    absl::Span<ExecutionInput const> arguments,
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal) {
  VLOG(3) << allocation.ToString();
  if (allocation.is_entry_computation_parameter()) {
    se::DeviceMemoryBase out = arguments[allocation.parameter_number()]
                                   .Buffer(allocation.param_shape_index())
                                   .AsDeviceMemoryBase();
    CHECK_EQ(allocation.size(), out.size())
        << "Size mismatch on param " << allocation.parameter_number()
        << " at shape index " << allocation.param_shape_index().ToString();
    VLOG(3) << "allocation is a parameter";
    return MaybeOwningDeviceMemory{out};
  } else if (allocation.is_constant()) {
    VLOG(3) << "allocation is a constant";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  } else if (allocation.is_thread_local()) {
    VLOG(3) << "buffer is thread-local";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  }

  int64_t buffer_size = allocation.size();
  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory out,
                      memory_allocator->Allocate(device_ordinal, buffer_size));
  VLOG(3) << "buffer allocated " << buffer_size << " bytes [" << out->opaque()
          << "]";

  // Since the output buffer and all the temporary buffers were written into
  // by the JITed code, msan has no way of knowing their memory was
  // initialized. Mark them initialized so that msan doesn't flag loads from
  // these buffers.
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(out->opaque(), buffer_size);
  return MaybeOwningDeviceMemory{std::move(out)};
}

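// Builds the buffer table consumed by the JITed entry function: one
// MaybeOwningDeviceMemory per BufferAllocation in the buffer assignment, in
// allocation-index order.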
StatusOr<std::vector<MaybeOwningDeviceMemory>> CpuExecutable::CreateBufferTable(
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
    absl::Span<ExecutionInput const> arguments) {
  std::vector<MaybeOwningDeviceMemory> buffers(
      assignment_->Allocations().size());
  VLOG(3) << "Allocating " << assignment_->Allocations().size()
          << " allocations for module " << module().name();
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    TF_ASSIGN_OR_RETURN(
        buffers[i], MemoryForAllocation(allocation, arguments, memory_allocator,
                                        device_ordinal));
  }

  if (VLOG_IS_ON(3)) {
    TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                        assignment_->GetUniqueTopLevelOutputSlice());
    VLOG(3) << "result index: " << result_slice.index();
  }
  return std::move(buffers);
}

Status CpuExecutable::ExecuteComputeFunction(
    const ExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory const> buffers,
    HloExecutionProfile* hlo_execution_profile) {
  // The calling convention for JITed functions is:
  //
  //   void function(void* result, const void* run_options, void** args_array,
  //                 void** buffer_table, int64* profile_counters)
  //
  // result: Points at the result.
  // run_options: the ExecutableRunOptions object.
  // args_array: null
  // buffer_table: An array of pointers, containing pointers to temporary
  //   buffers required by the executable and pointers to entry computation
  //   parameters.
  // profile_counters: An array of cycle counters used when HLO profiling is
  //   enabled, or null otherwise.

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  XlaDebugInfoManager::Get()->OnModuleStart(module_name_);
  auto cleanup = MakeCleanup(
      [&]() { XlaDebugInfoManager::Get()->OnModuleStop(module_name_); });

  size_t profile_counters_size =
      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
                            : 0;
  int64* profile_counters =
      hlo_execution_profile
          ? hlo_execution_profile->mutable_profile_counters()->data()
          : nullptr;

  // Call the computation function following the calling convention.
  std::vector<void*> buffer_pointers;
  for (auto& buffer : buffers) {
    buffer_pointers.push_back(
        const_cast<void*>(buffer.AsDeviceMemoryBase().opaque()));
  }
  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                      assignment_->GetUniqueTopLevelOutputSlice());
  void* result_buffer = buffer_pointers[result_slice.index()];
  if (VLOG_IS_ON(3)) {
    VLOG(3) << "Executing compute function:";
    VLOG(3) << absl::StrFormat(
        "  func(void* result, void* params[null], void* buffer_table[%u], "
        "uint64 profile_counters[%u])",
        buffer_pointers.size(), profile_counters_size);
    VLOG(3) << absl::StrFormat("    result = %p", result_buffer);
    auto ptr_printer = [](string* out, const void* p) {
      absl::StrAppend(out, absl::StrFormat("%p", p));
    };
    VLOG(3) << "    params = nullptr";
    VLOG(3) << absl::StrFormat(
        "    buffer_table = [%s]",
        absl::StrJoin(buffer_pointers, ", ", ptr_printer));
    VLOG(3) << absl::StrFormat("    profile_counters = %p", profile_counters);
  }

  compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(),
                    profile_counters);

  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->execution_profile()) {
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    run_options->execution_profile()->set_compute_time_ns(
        std::max(nanoseconds, 1.0));
    // If hlo profiling was disabled then the cycle count is left empty.
    if (hlo_execution_profile) {
      run_options->execution_profile()->set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

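// Assembles the ExecutionOutput returned to the caller: for each leaf of the
// result shape, either adopts a donated input buffer (input/output aliasing),
// takes ownership of the corresponding entry in `buffers`, or, under
// copy-protection, copies into a freshly allocated buffer.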
StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory> buffers,
    absl::Span<ExecutionInput> arguments) {
  se::Stream* stream = run_options->stream();
  ExecutionOutput result(/*on_device_shape=*/result_shape(),
                         run_options->allocator(),
                         stream->parent()->device_ordinal());
  const HloInputOutputAliasConfig& input_output_alias =
      module().input_output_alias_config();
  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  const Shape& root_shape = root->shape();

  // Move se::OwningDeviceMemory values which contain the array(s) of the
  // result into the respective location in the ScopedShapedBuffer which is
  // returned to the caller.
  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    se::DeviceMemoryBase& result_buffer = p.second;
    const HloValueSet& sources = this->GetRootValueSet().element(index);
    // The points-to set is unambiguous, so the set should be a singleton.
    CHECK_EQ(1, sources.values().size());
    const HloValue* value_source = sources.values()[0];
    HloInstruction* src = value_source->instruction();

    // The source for this result buffer can be a nested buffer such as
    // a tuple element.
    TF_ASSIGN_OR_RETURN(
        const BufferAllocation::Slice slice,
        this->assignment_->GetUniqueSlice(src, value_source->index()));
    const BufferAllocation::Index buffer_index = slice.index();

    // TODO(cheshire): duplication with other backends.
    absl::optional<HloInputOutputAliasConfig::Alias> alias =
        input_output_alias.GetAliasedParameter(index);
    if (alias) {
      CHECK_LT(alias->parameter_number, arguments.size());
      ExecutionInput& input = arguments[alias->parameter_number];
      MaybeOwningDeviceMemory* maybe_owning_memory =
          input.MutableBuffer(alias->parameter_index);
      if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: %s",
            alias->ToString());
      }
      if (absl::optional<se::OwningDeviceMemory> owning =
              maybe_owning_memory->Release()) {
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the input buffer, but if the execute call
        // fails we should not release it, as it still contains valid data
        // (for example, it is a parameter which the user wants us to alias,
        // in a gradient update computation). So we store the index into the
        // aliased vector, which is fed to the ExecutionOutput; the
        // ExecutionOutput uses these indices to drop the addresses from its
        // own ScopedShapedBuffer result if it is not committed.
        result.AddAliasedIndex(index);
      } else {
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64_t allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(root_shape, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            run_options->allocator()->Allocate(
                stream->parent()->device_ordinal(), allocation_size));
        result_buffer = allocated_buffer.Release();
        MaybeOwningDeviceMemory& registered_buffer = buffers[buffer_index];
        CHECK_EQ(result_buffer.size(),
                 registered_buffer.AsDeviceMemoryBase().size());
        std::memcpy(/*dest=*/result_buffer.opaque(),
                    /*src=*/registered_buffer.AsDeviceMemoryBase().opaque(),
                    /*n=*/result_buffer.size());
        registered_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      MaybeOwningDeviceMemory& buffer = buffers[buffer_index];
      if (absl::optional<se::OwningDeviceMemory> owned_buffer =
              buffer.Release()) {
        result_buffer = owned_buffer->Release();
        buffer = result_buffer;
      } else {
        result_buffer = buffer.AsDeviceMemoryBase();
        result.AddAliasedIndex(index);
      }
    }
  }
  return std::move(result);
}

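// Validates the argument shapes, builds the buffer table, assembles the
// ExecutionOutput, and then enqueues the actual computation on the host
// stream so that it runs asynchronously with respect to the caller.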
StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  if (GetRootValueSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  if (hlo_module_) {
    const HloComputation* entry_comp = hlo_module_->entry_computation();
    CHECK_EQ(entry_comp->num_parameters(), arguments.size())
        << "Wrong number of arguments passed when running executable";
    for (int64_t i = 0; i < entry_comp->num_parameters(); ++i) {
      const Shape& expected_shape =
          entry_comp->parameter_instruction(i)->shape();
      const Shape& actual_shape = arguments[i].Buffers().shape();
      TF_RET_CHECK(
          ShapeUtil::DynamicShapeIsCompatible(actual_shape, expected_shape))
          << "Shape mismatch on argument " << i << ", "
          << expected_shape.ToString(/*print_layout=*/true) << " vs. "
          << actual_shape.ToString(/*print_layout=*/true);
    }
  }

  auto* host_stream = dynamic_cast<se::host::HostStream*>(
      run_options->stream()->implementation());
  se::Stream* stream = run_options->stream();
  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
  TF_ASSIGN_OR_RETURN(
      std::vector<MaybeOwningDeviceMemory> buffers,
      CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(),
                        arguments));

  TF_ASSIGN_OR_RETURN(
      ExecutionOutput result,
      CreateResultShapedBuffer(run_options, absl::MakeSpan(buffers),
                               absl::MakeSpan(arguments)));

  // Logically we want this lambda to capture `buffers` by move, but ultimately
  // our functor needs to be wrapped in an std::function, and that requires the
  // functor to be copyable. Thus we perpetrate the hack of capturing buffers
  // "by shared pointer".
  //
  // We also need to change the types of some of the variables we capture:
  // run_options needs to change from a pointer to a value type, and arguments
  // needs to change from a Span into a vector. We use a struct instead
  // of a lambda to make this explicit.
  struct AsyncRunTask {
    CpuExecutable* executable;
    ServiceExecutableRunOptions run_options;
    std::shared_ptr<std::vector<MaybeOwningDeviceMemory>> task_buffers;
    HloExecutionProfile* hlo_execution_profile;

    void operator()() {
      // Failing a CHECK here is not great, but I don't see an obvious way to
      // return a failed Status asynchronously.
      TF_CHECK_OK(executable->ExecuteComputeFunction(
          &run_options.run_options(), *task_buffers, hlo_execution_profile));
    }
  };
  host_stream->EnqueueTask(
      AsyncRunTask{this, *run_options,
                   std::make_shared<std::vector<MaybeOwningDeviceMemory>>(
                       std::move(buffers)),
                   hlo_execution_profile});

  MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
  return std::move(result);
}

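// Computes the number of bytes `shape` occupies on the CPU backend, including
// the trailing metadata for dynamic shapes. For example, a dynamic f32[<=10]
// array takes the 40 bytes given by ShapeUtil::ByteSizeOf (sized by the
// dimension bound) plus 4 bytes of int32 metadata recording the actual extent
// of its single dynamic dimension.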
/*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
  // On the cpu, opaques are pointers.
  if (shape.IsOpaque()) {
    return sizeof(void*);
  }
  if (shape.is_static() || shape.IsTuple()) {
    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
  }
  // Each dynamic dimension size is represented as an S32.
  int64_t metadata_size = sizeof(int32) * shape.dimensions_size();
  return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size;
}

const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
  return assignment_->dataflow_analysis().GetInstructionValueSet(
      module().entry_computation()->root_instruction());
}

int64 CpuExecutable::SizeOfGeneratedCodeInBytes() const {
  return jit_->SizeOfGeneratedCodeInBytes();
}

}  // namespace cpu
}  // namespace xla