/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15
#include "tensorflow/compiler/xla/service/interpreter/executable_base.h"

#include <algorithm>
#include <cstdint>
#include <memory>
#include <optional>
#include <type_traits>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
32
33 namespace xla {
34 namespace interpreter {
35
InterpreterExecutableBase(std::unique_ptr<HloModule> hlo_module)36 InterpreterExecutableBase::InterpreterExecutableBase(
37 std::unique_ptr<HloModule> hlo_module)
38 : Executable(std::move(hlo_module), /*hlo_profile_printer_data=*/nullptr,
39 /*hlo_profile_index_map=*/nullptr) {}
40
ExecuteAsyncOnStream(const ServiceExecutableRunOptions * run_options,std::vector<ExecutionInput> arguments,HloExecutionProfile * hlo_execution_profile)41 StatusOr<ExecutionOutput> InterpreterExecutableBase::ExecuteAsyncOnStream(
42 const ServiceExecutableRunOptions* run_options,
43 std::vector<ExecutionInput> arguments,
44 HloExecutionProfile* hlo_execution_profile) {
45 se::Stream* stream = run_options->stream();
46 se::StreamExecutor* executor = stream->parent();
47 const se::Platform* platform = executor->platform();
48
49 // Convert the ShapeTree to a ShapedBuffer. We do this so we can call
50 // TransferManager methods below.
51 std::vector<ShapedBuffer> argument_buffers;
52 argument_buffers.reserve(arguments.size());
53 int device_ordinal = run_options->device_ordinal();
54 if (device_ordinal < 0) {
55 device_ordinal = 0;
56 }
57 for (auto& argument : arguments) {
58 const ShapeTree<MaybeOwningDeviceMemory>& buffers = argument.Buffers();
59 argument_buffers.push_back(ShapedBuffer(buffers.shape(),
60 /*device_ordinal=*/device_ordinal));
61 auto in_it = buffers.begin();
62 auto out_it = argument_buffers.back().buffers().begin();
63 for (; in_it != buffers.end(); ++in_it, ++out_it) {
64 out_it->second = in_it->second.AsDeviceMemoryBase();
65 }
66 }
67
68 VLOG(1) << "Execute " << module().name();
69 if (VLOG_IS_ON(2)) {
70 for (const auto& a : argument_buffers) {
71 VLOG(2) << "-- argument " << a;
72 }
73 }
74
75 uint64_t start_micros = tensorflow::Env::Default()->NowMicros();
76
77 const HloComputation* computation = module().entry_computation();
78 if (computation->num_parameters() != arguments.size()) {
79 return tensorflow::errors::Internal(
80 "Mismatch between argument count and graph parameter count.");
81 }
82
83 // Check that the args have the right shape.
84 for (int64_t i = 0; i < computation->num_parameters(); ++i) {
85 const auto& expected_shape = computation->parameter_instruction(i)->shape();
86 const auto& actual_shape = argument_buffers[i].on_device_shape();
87 bool shape_match = true;
88 if (expected_shape.is_dynamic()) {
89 if (!ShapeUtil::DynamicArrayShapeIsCompatible(actual_shape,
90 expected_shape)) {
91 shape_match = false;
92 }
93 } else if (!Shape::Equal().MinorToMajorOnlyInLayout()(expected_shape,
94 actual_shape)) {
95 shape_match = false;
96 }
97 if (!shape_match) {
98 return InvalidArgument(
99 "Shape mismatch on parameter %d. Expected %s, but was %s.", i,
100 ShapeUtil::HumanStringWithLayout(expected_shape),
101 ShapeUtil::HumanStringWithLayout(actual_shape));
102 }
103 }
104
105 TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
106 TransferManager::GetForPlatform(platform));
107
108 // Transform the ShapedBuffer arguments into literals which the evaluator
109 // consumes.
110 std::vector<Literal> arg_literals;
111 const int64_t num_parameters = computation->num_parameters();
112 arg_literals.reserve(num_parameters);
113 for (int64_t p = 0; p < num_parameters; ++p) {
114 TF_ASSIGN_OR_RETURN(Literal arg_literal,
115 transfer_manager->TransferLiteralFromDevice(
116 run_options->stream(), argument_buffers[p]));
117 const auto& expected_shape = computation->parameter_instruction(p)->shape();
118 if (expected_shape.is_dynamic()) {
119 // Expand the input literal to expected shape.
120 arg_literal = arg_literal.ToBoundedDynamic(expected_shape);
121 }
122 arg_literals.push_back(std::move(arg_literal));
123 }
124
125 TF_ASSIGN_OR_RETURN(Literal result_literal,
126 Evaluate(run_options, *computation, arg_literals));
127 // Shrink the generated dynamic shape into static shape.
128 result_literal = result_literal.ToStatic();
129
130 // Transform the result literal back into a ShapedBuffer.
131 const HloInputOutputAliasConfig& alias_config =
132 hlo_module_ == nullptr ? HloInputOutputAliasConfig()
133 : hlo_module_->input_output_alias_config();
134 TF_ASSIGN_OR_RETURN(ExecutionOutput result,
135 AllocateOutputMemoryWithInputReuse(
136 result_literal.shape(), alias_config,
137 run_options->allocator(), &arguments, stream));
138
139 TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralToDevice(
140 run_options->stream(), result_literal, result.Result()));
141
142 uint64_t end_micros = tensorflow::Env::Default()->NowMicros();
143
144 ExecutionProfile* profile = run_options->run_options().execution_profile();
145 if (profile) {
146 const double nanoseconds = (end_micros - start_micros) * 1000.0;
147 profile->set_compute_time_ns(std::max(nanoseconds, 1.0));
148 }
149 MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
150 return std::move(result);
151 }
152
153 StatusOr<ExecutionOutput>
AllocateOutputMemoryWithInputReuse(const Shape & shape,const HloInputOutputAliasConfig & alias_config,se::DeviceMemoryAllocator * allocator,std::vector<ExecutionInput> * arguments,se::Stream * stream)154 InterpreterExecutableBase::AllocateOutputMemoryWithInputReuse(
155 const Shape& shape, const HloInputOutputAliasConfig& alias_config,
156 se::DeviceMemoryAllocator* allocator,
157 std::vector<ExecutionInput>* arguments, se::Stream* stream) {
158 TF_RETURN_IF_ERROR(alias_config.ForEachAliasWithStatus(
159 [&](const ShapeIndex& output_index,
160 std::optional<HloInputOutputAliasConfig::Alias> alias) {
161 if (alias && alias->must_alias()) {
162 VLOG(1) << alias->ToString();
163 const MaybeOwningDeviceMemory& original_input =
164 (*arguments)[alias->parameter_number].Buffers().element(
165 alias->parameter_index);
166 if (!original_input.HasOwnership()) {
167 return InvalidArgument(
168 "An input was configured to be must-alias at "
169 "compile time but not donated at runtime: %s",
170 alias->ToString());
171 }
172 }
173 return OkStatus();
174 }));
175
176 se::StreamExecutor* executor = stream->parent();
177 const se::Platform* platform = executor->platform();
178 TF_ASSIGN_OR_RETURN(TransferManager * transfer_manager,
179 TransferManager::GetForPlatform(platform));
180
181 ExecutionOutput result(shape, allocator, executor->device_ordinal());
182 for (auto& pair : result.MutableResult()->buffers()) {
183 const ShapeIndex& result_index = pair.first;
184 se::DeviceMemoryBase& result_buffer = pair.second;
185 int64_t allocation_bytes =
186 transfer_manager->GetByteSizeRequirement(ShapeUtil::GetSubshape(
187 result.Result().on_device_shape(), result_index));
188
189 if (!ShapeUtil::IndexIsValid(alias_config.shape(), result_index)) {
190 return InternalError("result_index is invalid: %s",
191 result_index.ToString());
192 }
193
194 std::optional<HloInputOutputAliasConfig::Alias> alias =
195 alias_config.GetAliasedParameter(result_index);
196 if (alias) {
197 TF_RET_CHECK(alias->parameter_number < arguments->size());
198 ExecutionInput& input = (*arguments)[alias->parameter_number];
199 MaybeOwningDeviceMemory* device_memory =
200 input.MutableBuffer(alias->parameter_index);
201 if (auto owning = device_memory->Release()) {
202 se::DeviceMemoryBase device_memory_base = owning->Release();
203 *device_memory = device_memory_base;
204 result_buffer = device_memory_base;
205 result.AddAliasedIndex(result_index);
206 } else {
207 VLOG(2) << "An input was not reused since it is not donated "
208 << alias->ToString();
209 }
210 }
211
212 if (result_buffer.is_null()) {
213 const Shape& on_device_shape = result.Result().on_device_shape();
214 const Shape& on_device_subshape =
215 ShapeUtil::GetSubshape(on_device_shape, result_index);
216 TF_ASSIGN_OR_RETURN(
217 auto allocated_buffer,
218 allocator->Allocate(executor->device_ordinal(), allocation_bytes,
219 /*retry_on_failure=*/true,
220 LayoutUtil::MemorySpace(on_device_subshape)));
221 result_buffer = allocated_buffer.Release();
222 }
223 TF_RET_CHECK(allocation_bytes == 0 || result_buffer != nullptr);
224 }
225
226 TF_RETURN_IF_ERROR(
227 transfer_manager->WriteTupleIndexTables(stream, result.Result()));
228 return std::move(result);
229 }
230
231 } // namespace interpreter
232 } // namespace xla
233