/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"

#include <stdint.h>

#include <algorithm>
#include <set>
#include <unordered_set>
#include <utility>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/xla_debug_info_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/host/host_stream.h"

namespace xla {
namespace cpu {
CpuExecutable::CpuExecutable(
    std::unique_ptr<SimpleOrcJIT> jit,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      jit_(std::move(jit)),
      assignment_(std::move(assignment)),
      module_name_(entry_function_name) {
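  // Serialize the buffer assignment so it can be registered with the debug
  // info manager alongside the module.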
  if (assignment_) {
    buffer_assignment_.reset(new BufferAssignmentProto(assignment_->ToProto()));
  }
  XlaDebugInfoManager::Get()->RegisterModule(module_name_, shared_module(),
                                             buffer_assignment_);

  // Resolve symbols in the constructor rather than at execution time to avoid
  // races because FindSymbol is not thread safe.
  llvm::Expected<llvm::JITEvaluatedSymbol> sym =
      jit_->FindCompiledSymbol(entry_function_name);
  // We expect to find the symbol provided with entry_function_name; otherwise
  // this is an internal error.
  CHECK(*sym) << "Symbol " << entry_function_name << " not found.";
  // getAddress can do work under the hood in the jit, so it needs to be
  // guarded by the mutex.
  compute_function_ = reinterpret_cast<ComputeFunctionType>(sym->getAddress());
  VLOG(1) << "compute_function_ at address "
          << reinterpret_cast<void*>(compute_function_);
}

CpuExecutable::~CpuExecutable() {
  XlaDebugInfoManager::Get()->UnregisterModule(module_name_, shared_module(),
                                               buffer_assignment_);
}

static StatusOr<MaybeOwningDeviceMemory> MemoryForAllocation(
    const BufferAllocation& allocation,
    absl::Span<ExecutionInput const> arguments,
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal) {
  VLOG(3) << allocation.ToString();
  if (allocation.is_entry_computation_parameter()) {
    se::DeviceMemoryBase out = arguments[allocation.parameter_number()]
                                   .Buffer(allocation.param_shape_index())
                                   .AsDeviceMemoryBase();
    CHECK_EQ(allocation.size(), out.size())
        << "Size mismatch on param " << allocation.parameter_number()
        << " at shape index " << allocation.param_shape_index().ToString();
    VLOG(3) << "allocation is a parameter";
    return MaybeOwningDeviceMemory{out};
  } else if (allocation.is_constant()) {
    VLOG(3) << "allocation is a constant";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  } else if (allocation.is_thread_local()) {
    VLOG(3) << "buffer is thread-local";
    return MaybeOwningDeviceMemory{se::DeviceMemoryBase{}};
  }

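  // Every other allocation (temporaries and the output) gets freshly
  // allocated memory from the allocator.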
  int64 buffer_size = allocation.size();
  TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory out,
                      memory_allocator->Allocate(device_ordinal, buffer_size));
  VLOG(3) << "buffer allocated " << buffer_size << " bytes [" << out->opaque()
          << "]";

  // Since the output buffer and all the temporary buffers were written into
  // by the JITed code, msan has no way of knowing their memory was
  // initialized. Mark them initialized so that msan doesn't flag loads from
  // these buffers.
  TF_ANNOTATE_MEMORY_IS_INITIALIZED(out->opaque(), buffer_size);
  return MaybeOwningDeviceMemory{std::move(out)};
}

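// Builds the buffer table for one execution: one entry per BufferAllocation,
// either pointing at caller-provided parameter memory or at freshly allocated
// temporary/output memory.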
StatusOr<std::vector<MaybeOwningDeviceMemory>> CpuExecutable::CreateBufferTable(
    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
    absl::Span<ExecutionInput const> arguments) {
  std::vector<MaybeOwningDeviceMemory> buffers(
      assignment_->Allocations().size());
  VLOG(3) << "Allocating " << assignment_->Allocations().size()
          << " allocations for module " << module().name();
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    TF_ASSIGN_OR_RETURN(
        buffers[i], MemoryForAllocation(allocation, arguments, memory_allocator,
                                        device_ordinal));
  }

  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                      assignment_->GetUniqueTopLevelOutputSlice());
  VLOG(3) << "result index: " << result_slice.index();
  return std::move(buffers);
}

Status CpuExecutable::ExecuteComputeFunction(
    const ExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory const> buffers,
    HloExecutionProfile* hlo_execution_profile) {
  // The calling convention for JITed functions is:
  //
  //  void function(void* result, const void* run_options, void** args_array,
  //                void** buffer_table, int64* profile_counters)
  //
  // result: Points at the result.
  // run_options: the ExecutableRunOptions object.
  // args_array: null
  // buffer_table: An array of pointers, containing pointers to temporary
  //   buffers required by the executable and pointers to entry computation
  //   parameters.
  // profile_counters: used when HLO profiling is enabled; null otherwise.
  //

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  XlaDebugInfoManager::Get()->OnModuleStart(module_name_);
  auto cleanup = MakeCleanup(
      [&]() { XlaDebugInfoManager::Get()->OnModuleStop(module_name_); });

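  // Gather the raw profile-counter storage if HLO profiling is enabled; the
  // JITed code updates these counters in place.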
  size_t profile_counters_size =
      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
                            : 0;
  int64* profile_counters =
      hlo_execution_profile
          ? hlo_execution_profile->mutable_profile_counters()->data()
          : nullptr;

  // Call the computation function following the calling convention.
  std::vector<void*> buffer_pointers;
  for (auto& buffer : buffers) {
    buffer_pointers.push_back(
        const_cast<void*>(buffer.AsDeviceMemoryBase().opaque()));
  }
  TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice result_slice,
                      assignment_->GetUniqueTopLevelOutputSlice());
  void* result_buffer = buffer_pointers[result_slice.index()];
  if (VLOG_IS_ON(3)) {
    VLOG(3) << "Executing compute function:";
    VLOG(3) << absl::StrFormat(
        " func(void* result, void* params[null], void* buffer_table[%u], "
        "uint64 profile_counters[%u])",
        buffer_pointers.size(), profile_counters_size);
    VLOG(3) << absl::StrFormat(" result = %p", result_buffer);
    auto ptr_printer = [](string* out, const void* p) {
      absl::StrAppend(out, absl::StrFormat("%p", p));
    };
    VLOG(3) << " params = nullptr";
    VLOG(3) << absl::StrFormat(
        " buffer_table = [%s]",
        absl::StrJoin(buffer_pointers, ", ", ptr_printer));
    VLOG(3) << absl::StrFormat(" profile_counters = %p", profile_counters);
  }

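  // Invoke the JIT-compiled entry function. args_array is null because
  // parameters are passed through the buffer table instead.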
  compute_function_(result_buffer, run_options, nullptr, buffer_pointers.data(),
                    profile_counters);

  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  if (run_options->execution_profile()) {
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    run_options->execution_profile()->set_compute_time_ns(
        std::max(nanoseconds, 1.0));
    // If hlo profiling was disabled then the cycle count is left empty.
    if (hlo_execution_profile) {
      run_options->execution_profile()->set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

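// Assembles the ExecutionOutput for the caller from the buffer table,
// honoring any input/output aliasing configured on the module.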
StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
    const ServiceExecutableRunOptions* run_options,
    absl::Span<MaybeOwningDeviceMemory> buffers,
    absl::Span<ExecutionInput> arguments) {
  se::Stream* stream = run_options->stream();
  ExecutionOutput result(/*on_device_shape=*/result_shape(),
                         run_options->allocator(),
                         stream->parent()->device_ordinal());
  const HloInputOutputAliasConfig& input_output_alias =
      module().input_output_alias_config();
  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  const Shape& root_shape = root->shape();

  // Move se::OwningDeviceMemory values which contain the array(s) of the
  // result into the respective location in ScopedShapedBuffer which is
  // returned to the caller.
  for (auto& p : result.MutableResult()->buffers()) {
    const ShapeIndex& index = p.first;
    se::DeviceMemoryBase& result_buffer = p.second;
    const HloValueSet& sources = this->GetRootValueSet().element(index);
    // The points-to set is unambiguous so the set should be a singleton.
    CHECK_EQ(1, sources.values().size());
    const HloValue* value_source = sources.values()[0];
    HloInstruction* src = value_source->instruction();

    // The source for this result buffer can be a nested buffer such as
    // a tuple element.
    TF_ASSIGN_OR_RETURN(
        const BufferAllocation::Slice slice,
        this->assignment_->GetUniqueSlice(src, value_source->index()));
    const BufferAllocation::Index buffer_index = slice.index();

    // TODO(cheshire): duplication with other backends.
    absl::optional<HloInputOutputAliasConfig::Alias> alias =
        input_output_alias.GetAliasedParameter(index);
    if (alias) {
      CHECK_LT(alias->parameter_number, arguments.size());
      ExecutionInput& input = arguments[alias->parameter_number];
      MaybeOwningDeviceMemory* maybe_owning_memory =
          input.MutableBuffer(alias->parameter_index);
      if (alias->must_alias() && !maybe_owning_memory->HasOwnership()) {
        return InvalidArgument(
            "An input was configured to be must-alias at "
            "compile time but not donated at runtime: %s",
            alias->ToString());
      }
      if (absl::optional<se::OwningDeviceMemory> owning =
              maybe_owning_memory->Release()) {
        // If the caller passes the ownership of the device memory, reuse it
        // as the output buffer. It is up to the caller whether or not to
        // donate a buffer; the aliasing information describes which buffers
        // may alias, not buffers that must alias.
        se::DeviceMemoryBase argument_buffer = owning->Release();
        *maybe_owning_memory = argument_buffer;
        result_buffer = argument_buffer;
        // The caller is giving us the input buffer, but if the execute call
        // fails we should not release it, since it still contains valid data
        // (for example, a parameter the user asked us to alias in a gradient
        // update computation). So we record this index in the aliased set of
        // the ExecutionOutput, which uses those indices to drop the addresses
        // from its own ScopedShapedBuffer result if the ExecutionOutput is
        // not committed.
        result.AddAliasedIndex(index);
      } else {
        VLOG(3) << "Using copy-protection: aliasing is specified, but the "
                   "buffer is not donated; allocating a fresh buffer";
        int64 allocation_size =
            ShapeUtil::ByteSizeOf(ShapeUtil::GetSubshape(root_shape, index));
        TF_ASSIGN_OR_RETURN(
            se::OwningDeviceMemory allocated_buffer,
            run_options->allocator()->Allocate(
                stream->parent()->device_ordinal(), allocation_size));
        result_buffer = allocated_buffer.Release();
        MaybeOwningDeviceMemory& registered_buffer = buffers[buffer_index];
        CHECK_EQ(result_buffer.size(),
                 registered_buffer.AsDeviceMemoryBase().size());
        std::memcpy(/*dest=*/result_buffer.opaque(),
                    /*src=*/registered_buffer.AsDeviceMemoryBase().opaque(),
                    /*n=*/result_buffer.size());
        registered_buffer = result_buffer;
      }
    }

    if (result_buffer.is_null()) {
      MaybeOwningDeviceMemory& buffer = buffers[buffer_index];
      if (absl::optional<se::OwningDeviceMemory> owned_buffer =
              buffer.Release()) {
        result_buffer = owned_buffer->Release();
        buffer = result_buffer;
      } else {
        result_buffer = buffer.AsDeviceMemoryBase();
        result.AddAliasedIndex(index);
      }
    }
  }
  return std::move(result);
}

StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    std::vector<ExecutionInput> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  if (GetRootValueSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

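  // Check that the caller passed one buffer tree per entry-computation
  // parameter, and that each argument shape is compatible with the
  // (possibly dynamic) parameter shape.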
  if (hlo_module_) {
    const HloComputation* entry_comp = hlo_module_->entry_computation();
    CHECK_EQ(entry_comp->num_parameters(), arguments.size())
        << "Wrong number of arguments passed when running executable";
    for (int64 i = 0; i < entry_comp->num_parameters(); ++i) {
      const Shape& expected_shape =
          entry_comp->parameter_instruction(i)->shape();
      const Shape& actual_shape = arguments[i].Buffers().shape();
      TF_RET_CHECK(
          ShapeUtil::DynamicShapeIsCompatible(actual_shape, expected_shape))
          << "Shape mismatch on argument " << i << ", "
          << expected_shape.ToString(/*print_layout=*/true) << " vs. "
          << actual_shape.ToString(/*print_layout=*/true);
    }
  }

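  // The CPU backend executes on a HostStream. Allocate the buffer table and
  // assemble the result now, then enqueue the actual compute work on the
  // stream below.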
348
349 auto* host_stream = dynamic_cast<se::host::HostStream*>(
350 run_options->stream()->implementation());
351 se::Stream* stream = run_options->stream();
352 se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
353 TF_ASSIGN_OR_RETURN(
354 std::vector<MaybeOwningDeviceMemory> buffers,
355 CreateBufferTable(memory_allocator, stream->parent()->device_ordinal(),
356 arguments));
357
358 TF_ASSIGN_OR_RETURN(
359 ExecutionOutput result,
360 CreateResultShapedBuffer(run_options, absl::MakeSpan(buffers),
361 absl::MakeSpan(arguments)));
362
363 // Logically we want this lambda to capture `buffers` by move, ultimately our
364 // functor needs to be wrapped in an std::function, and that requires its
365 // functor to be copyable. Thus we perpetrate the hack of capturing buffers
366 // "by shared pointer".
367 //
368 // We also need to change the types of some of the variables we capture:
369 // run_options needs to change from a pointer to a value type, and arguments
370 // needs to change from a Span into a vector. We use a struct instead
371 // of a lambda to make this explicit.
372 struct AsyncRunTask {
373 CpuExecutable* executable;
374 ServiceExecutableRunOptions run_options;
375 std::shared_ptr<std::vector<MaybeOwningDeviceMemory>> task_buffers;
376 HloExecutionProfile* hlo_execution_profile;
377
378 void operator()() {
379 // Failing a CHECK here is not great, but I don't see an obvious way to
380 // return a failed Status asynchronously.
381 TF_CHECK_OK(executable->ExecuteComputeFunction(
382 &run_options.run_options(), *task_buffers, hlo_execution_profile));
383 }
384 };
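  // HostStream runs enqueued tasks in order on its own thread, so the compute
  // function executes asynchronously with respect to the caller.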
  host_stream->EnqueueTask(
      AsyncRunTask{this, *run_options,
                   std::make_shared<std::vector<MaybeOwningDeviceMemory>>(
                       std::move(buffers)),
                   hlo_execution_profile});

  MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
  return std::move(result);
}

/*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) {
  // On the cpu, opaques are pointers.
  if (shape.IsOpaque()) {
    return sizeof(void*);
  }
  if (shape.is_static() || shape.IsTuple()) {
    return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
  }
  // Each dynamic dimension size is represented as an S32.
  int64 metadata_size = sizeof(int32) * shape.dimensions_size();
  return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size;
}

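// Returns the dataflow value set for the root instruction of the entry
// computation; this maps each result buffer index back to the HLO value that
// defines it.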
const InstructionValueSet& CpuExecutable::GetRootValueSet() const {
  return assignment_->dataflow_analysis().GetInstructionValueSet(
      module().entry_computation()->root_instruction());
}

int64 CpuExecutable::SizeOfGeneratedCodeInBytes() const {
  return jit_->SizeOfGeneratedCodeInBytes();
}

}  // namespace cpu
}  // namespace xla