/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/kernels/hexagon/hexagon_control_wrapper.h"

#include "tensorflow/core/framework/graph_transfer_info.pb.h"
#include "tensorflow/core/framework/remote_fused_graph_execute_info.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/kernels/hexagon/hexagon_ops_definitions.h"
#include "tensorflow/core/kernels/hexagon/soc_interface.h"
#include "tensorflow/core/platform/profile_utils/cpu_utils.h"

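// A rough sketch of the call sequence this wrapper expects (normally driven
// by the remote fused graph execute op; the method names below are real, but
// the surrounding setup is illustrative only):
//
//   HexagonControlWrapper wrapper;
//   wrapper.Init(info);            // Load or import graph transfer info.
//   wrapper.SetupGraph();          // Build the graph on the Hexagon DSP.
//   wrapper.FillInputNode(input_name, input_tensor);
//   wrapper.ExecuteGraph();
//   wrapper.ReadOutputNode(output_name, tensor_allocator);
//   wrapper.TeardownGraph();
//   wrapper.Finalize();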
namespace tensorflow {

constexpr const char* const OUTPUT_OP_NAME = "OUTPUT";
constexpr const char* const REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX =
    "hexagon_remote_fused_graph";
/* static */ constexpr const char* const
    HexagonControlWrapper::REMOTE_FUSED_GRAPH_EXECUTOR_NAME;

constexpr int ALIGNMENT_BYTES = 16;
constexpr int MAX_IN_OUT_COUNT = 128;

const bool DBG_DUMP_VERIFICATION_STRING = false;
const int DBG_LEVEL = 0;  // -2: verbose, -1: debug, 0: info
const bool DBG_USE_DUMMY_INPUT = false;
const bool DBG_USE_SAMPLE_INPUT = false;
const int64 FLAG_ENABLE_PANDA_BINARY_INPUT = 0x01;
const bool DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA = false;

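// Returns |node_name| unchanged if it already contains an explicit port
// (e.g. "foo:1"); otherwise appends the default port to yield "foo:0".
// This keeps the keys of input_port_map_ and output_port_map_ uniform.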
static string AddPort(const string& node_name) {
  if (node_name.find(':') != string::npos) {
    return node_name;
  } else {
    return strings::StrCat(node_name, ":", 0);
  }
}

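// Rounds |ptr| up to the next ALIGNMENT_BYTES boundary. Callers over-allocate
// their buffers by ALIGNMENT_BYTES (or ALIGNMENT_BYTES - 1) extra bytes so
// that the aligned pointer plus payload still fits.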
static uint8* FindAlignedPointer(uint8* ptr) {
  const uintptr_t data_ptr_int = reinterpret_cast<uintptr_t>(ptr);
  const int shift_count =
      (ALIGNMENT_BYTES - data_ptr_int % ALIGNMENT_BYTES) % ALIGNMENT_BYTES;
  uint8* data_ptr = ptr + shift_count;
  return data_ptr;
}

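// Finds a node by name with a linear scan over node_info(); returns nullptr
// if no node matches.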
/* static */ GraphTransferNodeInfo* HexagonControlWrapper::FindNodeInfo(
    const string& name, GraphTransferInfo* graph_transfer_info) {
  for (GraphTransferNodeInfo& node_info :
       *graph_transfer_info->mutable_node_info()) {
    if (node_info.name() == name) {
      return &node_info;
    }
  }
  return nullptr;
}

int HexagonControlWrapper::GetVersion() {
  return soc_interface_GetSocControllerVersion();
}

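// Initializes the wrapper: builds graph transfer info from the given proto
// (or imports a pre-serialized one), brings up the SoC interface, and
// allocates DSP-side input/output buffers sized from the default tensor
// shapes.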
bool HexagonControlWrapper::Init(const RemoteFusedGraphExecuteInfo& info) {
  soc_interface_SetLogLevel(DBG_LEVEL);
  if (DBG_USE_SAMPLE_INPUT) {
    soc_interface_SetDebugFlag(FLAG_ENABLE_PANDA_BINARY_INPUT);
  }
  if (info.serialized_executor_parameters().empty()) {
    std::vector<std::pair<string, Tensor>> inputs;
    std::vector<string> outputs;
    RemoteFusedGraphExecuteUtils::BuildRemoteGraphInputsAndOutputsFromProto(
        info, &inputs, &outputs);
    Status status = graph_transferer_.LoadGraphFromProto(
        HexagonOpsDefinitions::getInstance(), info.remote_graph(), inputs,
        outputs,
        false  // shape_inference_for_unknown_shape
    );
    TF_CHECK_OK(status) << status;
  } else {
    // If graph transfer info is attached, just import it.
    graph_transferer_.SetSerializedGraphTransferInfo(
        info.serialized_executor_parameters());
  }
  execute_info_ = &info;
  bool success = soc_interface_Init();
  if (!success) {
    LOG(ERROR) << "Hexagon initialization failed. See log output.";
    return false;
  }
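  // Derive the byte size of every graph input and output from its default
  // shape and data type; these sizes drive the buffer allocation below.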
  std::vector<int> input_sizes;
  std::vector<int> output_sizes;
  CHECK_NOTNULL(execute_info_);
  for (int i = 0; i < execute_info_->graph_input_node_name_size(); ++i) {
    const string& input = execute_info_->graph_input_node_name(i);
    LOG(INFO) << "Add input: " << input << ", " << i;
    CHECK(input_port_map_.emplace(AddPort(input), i).second);
    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
        execute_info_->default_graph_input_tensor_shape(i);
    int64 buf_size = DataTypeSize(shape_type.dtype());
    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
      buf_size *= dim.size();
    }
    input_sizes.emplace_back(static_cast<int>(buf_size));
  }
  for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) {
    const string& output = execute_info_->graph_output_node_name(i);
    CHECK(output_port_map_.emplace(AddPort(output), i).second);
    const RemoteFusedGraphExecuteInfo::TensorShapeTypeProto& shape_type =
        execute_info_->default_graph_output_tensor_shape(i);

    int64 buf_size = DataTypeSize(shape_type.dtype());
    for (const TensorShapeProto::Dim& dim : shape_type.shape().dim()) {
      buf_size *= dim.size();
    }
    output_sizes.emplace_back(static_cast<int>(buf_size));
  }

  LOG(INFO) << "Allocate inout buffer";
  success &= soc_interface_AllocateInOutNodeBuffers(
      input_sizes.size(), input_sizes.data(), output_sizes.size(),
      output_sizes.data());
  return success;
}

bool HexagonControlWrapper::Finalize() { return soc_interface_Finalize(); }
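
// Translates the imported graph transfer info into hexnn library calls:
// appends a synthetic OUTPUT node, registers per-node input/output parameter
// arrays, uploads const node data, appends op nodes, and finally constructs
// the graph on the DSP.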
bool HexagonControlWrapper::SetupGraph() {
  // Copy graph transfer info and modify it to adapt to the hexnn library.
  GraphTransferInfo& graph_transfer_info =
      graph_transferer_.GetMutableGraphTransferInfo();

  // Verify that every graph input node is present in the graph transfer info.
  for (const GraphTransferGraphInputNodeInfo& graph_input :
       graph_transfer_info.graph_input_node_info()) {
    GraphTransferNodeInfo* node_info =
        FindNodeInfo(graph_input.name(), &graph_transfer_info);
    CHECK_NE(node_info, nullptr);
  }

  // Generate a new output node which is connected to the graph output node.
  // TODO(satok): Support multiple output nodes
  CHECK_EQ(graph_transfer_info.graph_output_node_info_size(), 1);
  for (const GraphTransferGraphOutputNodeInfo& graph_output :
       graph_transfer_info.graph_output_node_info()) {
    const int new_output_node_id = graph_transfer_info.node_info_size() +
                                   graph_transfer_info.const_node_info_size() +
                                   2 /* offset for ids */;
    // Register a new output node
    GraphTransferNodeInfo& new_output_node_info =
        *graph_transfer_info.add_node_info();
    new_output_node_info.set_name(OUTPUT_OP_NAME);
    new_output_node_info.set_node_id(new_output_node_id);
    new_output_node_info.set_type_name(OUTPUT_OP_NAME);
    new_output_node_info.set_soc_op_id(
        HexagonOpsDefinitions::getInstance().GetOpIdFor(OUTPUT_OP_NAME, {}));
    new_output_node_info.set_padding_id(0 /* PADDING_NA_ID */);
    new_output_node_info.set_input_count(1);
    new_output_node_info.set_output_count(0);

    const TensorId tid = ParseTensorName(graph_output.name());
    const string node_name(tid.first);
    const int port = tid.second;
    // Register node input for the new output node
    const GraphTransferNodeInfo* node_info =
        FindNodeInfo(node_name, &graph_transfer_info);
    CHECK_NE(node_info, nullptr);
    GraphTransferNodeInputInfo& node_input_info =
        *graph_transfer_info.add_node_input_info();
    node_input_info.set_node_id(new_output_node_id);
    GraphTransferNodeInput& node_input = *node_input_info.add_node_input();
    node_input.set_node_id(node_info->node_id());
    node_input.set_output_port(port);
  }

  if (DBG_DUMP_VERIFICATION_STRING) {
    GraphTransferer gt;
    gt.SetSerializedGraphTransferInfo(graph_transfer_info.SerializeAsString());
    gt.DumpVerificationStringOfNodeTransferParams();
  }

  int inputs_count = 0;
  int outputs_count = 0;
  for (const GraphTransferNodeInputInfo& input_params :
       graph_transfer_info.node_input_info()) {
    inputs_count += input_params.node_input_size();
  }

  for (const GraphTransferNodeOutputInfo& output_params :
       graph_transfer_info.node_output_info()) {
    outputs_count += output_params.max_byte_size_size();
  }
  // Allocate memory for node inputs and node outputs
  soc_interface_AllocateNodeInputAndNodeOutputArray(inputs_count,
                                                    outputs_count);

  // Construct node input parameters
  std::unordered_map<int, std::tuple<void*, int>> inputs_map;
  for (const GraphTransferNodeInputInfo& input_params :
       graph_transfer_info.node_input_info()) {
    const int count = input_params.node_input_size();
    CHECK(count <= MAX_IN_OUT_COUNT);
    int node_ids[MAX_IN_OUT_COUNT];
    int ports[MAX_IN_OUT_COUNT];
    for (int i = 0; i < count; ++i) {
      const GraphTransferNodeInput& node_input = input_params.node_input(i);
      node_ids[i] = node_input.node_id() + NODE_ID_OFFSET;
      ports[i] = node_input.output_port();
    }
    void* inputs_ptr = soc_interface_SetOneNodeInputs(count, node_ids, ports);
    const int node_id = input_params.node_id();
    CHECK(inputs_map.count(node_id) == 0);
    inputs_map.emplace(node_id, std::make_tuple(inputs_ptr, count));
  }

  // Construct node output parameters
  std::unordered_map<int, std::tuple<void*, int>> outputs_map;
  for (const GraphTransferNodeOutputInfo& output_params :
       graph_transfer_info.node_output_info()) {
    const int count = output_params.max_byte_size_size();
    CHECK(count <= MAX_IN_OUT_COUNT);
    int sizes[MAX_IN_OUT_COUNT];
    for (int i = 0; i < count; ++i) {
      const int size = output_params.max_byte_size(i);
      sizes[i] = size;
    }
    void* outputs_ptr = soc_interface_SetOneNodeOutputs(count, sizes);
    const int node_id = output_params.node_id();
    CHECK(outputs_map.count(node_id) == 0);
    outputs_map.emplace(node_id, std::make_tuple(outputs_ptr, count));
  }

  // Instantiate graph
  soc_interface_InstantiateGraph();

  // Initialize graph
  // 1. Setup const nodes
  for (const GraphTransferConstNodeInfo& params :
       graph_transfer_info.const_node_info()) {
    const int node_id = params.node_id();
    // TODO(satok): Stop assuming shape size is 4.
    CHECK(params.shape_size() == 4);
    const int64 shape_0 = params.shape(0);
    const int64 shape_1 = params.shape(1);
    const int64 shape_2 = params.shape(2);
    const int64 shape_3 = params.shape(3);
    const int data_size = params.data().length();
    CHECK(dummy_const_data_.count(node_id) == 0);
    auto data = dummy_const_data_.emplace(
        std::piecewise_construct, std::make_tuple(node_id), std::make_tuple());
    CHECK(data.second);
    data.first->second.resize(data_size + ALIGNMENT_BYTES - 1);
    uint8* data_ptr = FindAlignedPointer(data.first->second.data());
    std::memcpy(data_ptr, params.data().data(), data_size);
    soc_interface_AppendConstNode(params.name().c_str(),
                                  node_id + NODE_ID_OFFSET, shape_0, shape_1,
                                  shape_2, shape_3, data_ptr, data_size);
  }

  // 2. Setup op nodes
  for (const GraphTransferNodeInfo& params : graph_transfer_info.node_info()) {
    const int node_id = params.node_id();
    const int op_id = params.soc_op_id();
    CHECK(inputs_map.count(node_id) == 1);
    CHECK(outputs_map.count(node_id) <= 1);
    // Only the output node has no outputs.
    const bool has_output = outputs_map.count(node_id) == 1;
    const auto& input_ptr_and_count = inputs_map.at(node_id);
    const void* input_ptr = std::get<0>(input_ptr_and_count);
    const int input_count = std::get<1>(input_ptr_and_count);
    void* output_ptr = nullptr;
    int output_count = 0;
    if (has_output) {
      const auto& output_ptr_and_count = outputs_map.at(node_id);
      output_ptr = std::get<0>(output_ptr_and_count);
      output_count = std::get<1>(output_ptr_and_count);
      // CHECK(output_count > 0);
    }
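    // Map the TensorFlow padding enum onto the SoC padding ids. The numeric
    // values (0 = NA, 1 = SAME, 2 = VALID) appear to follow hexagon_nn's
    // NN_PAD_* ordering; anything else is a fatal error.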
    int padding_id = -1;
    if (params.padding_id() == 0) {
      padding_id = 0;
    } else if (params.padding_id() == Padding::SAME) {
      padding_id = 1;
    } else if (params.padding_id() == Padding::VALID) {
      padding_id = 2;
    } else {
      LOG(FATAL);
    }
    soc_interface_AppendNode(params.name().c_str(), node_id + NODE_ID_OFFSET,
                             op_id, padding_id, input_ptr, input_count,
                             output_ptr, output_count);
  }

  LOG(INFO) << "Setup graph completed";

  // 3. Construct graph
  return soc_interface_ConstructGraph();

  // Keep the following comment to use dummy graph construction:
  // return soc_interface_setupDummyGraph(3 /* inception version */);
}

bool HexagonControlWrapper::ExecuteGraph() {
  return soc_interface_ExecuteGraph();
}

bool HexagonControlWrapper::TeardownGraph() {
  soc_interface_ReleaseNodeInputAndNodeOutputArray();
  return soc_interface_TeardownGraph();
}

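// Stages |bytes| in an internally owned, ALIGNMENT_BYTES-aligned buffer for
// the given input port and hands it to the SoC interface. The staging buffer
// is kept alive in input_tensor_data_ across the graph execution.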
bool HexagonControlWrapper::FillInputNode(
    const string& node_name,
    const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE>& shape,
    const ConstByteArray bytes) {
  const string tensor_name = AddPort(node_name);
  CHECK(input_port_map_.count(tensor_name) > 0);
  const int port = input_port_map_.at(tensor_name);
  if (input_tensor_data_.count(port) <= 0) {
    input_tensor_data_.emplace(port, std::vector<uint8>{});
  }
  std::vector<uint8>& input_tensor_data = input_tensor_data_.at(port);

  // Hexagon only supports 32-bit dimensions.
  const int x = static_cast<int>(shape[0]);
  const int y = static_cast<int>(shape[1]);
  const int z = static_cast<int>(shape[2]);
  const int d = static_cast<int>(shape[3]);

  const uint64 byte_size = x * y * z * d * DataTypeSize(std::get<2>(bytes));
  CHECK_EQ(byte_size, std::get<1>(bytes));
  input_tensor_data.resize(byte_size + ALIGNMENT_BYTES);
  uint8* data_ptr = FindAlignedPointer(input_tensor_data.data());

  if (DBG_USE_DUMMY_INPUT) {
    std::memset(data_ptr, 0, byte_size);
  } else {
    std::memcpy(data_ptr, std::get<0>(bytes), byte_size);
  }

  return soc_interface_FillInputNodeWithPort(port, x, y, z, d, data_ptr,
                                             byte_size);
}

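// Reads one output node into a Tensor obtained from |tensor_allocator|. The
// output shape is currently taken from the default shape recorded at fuse
// time rather than derived from the actual input shape (see TODO below).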
bool HexagonControlWrapper::ReadOutputNode(
    const string& node_name, TensorAllocatorFunc tensor_allocator) {
  CHECK_NE(execute_info_, nullptr);
  TensorShape output_shape;
  // TODO(satok): Switch shape corresponding to input shape
  for (int i = 0; i < execute_info_->graph_output_node_name_size(); ++i) {
    if (execute_info_->graph_output_node_name(i) == node_name) {
      for (const TensorShapeProto::Dim& dim :
           execute_info_->default_graph_output_tensor_shape(i).shape().dim()) {
        output_shape.AddDim(dim.size());
      }
      break;
    }
  }
  std::vector<ByteArray> outputs;
  ReadOutputNode(node_name, &outputs);
  CHECK_EQ(1, outputs.size());
  ByteArray& output = outputs[0];
  Tensor* output_tensor = tensor_allocator(output_shape);
  CHECK(output_tensor->TotalBytes() >= std::get<1>(output))
      << output_tensor->TotalBytes() << ", " << std::get<1>(output);
  TF_CHECK_OK(RemoteFusedGraphExecuteUtils::CopyByteArrayToTensor(
      std::get<0>(output), std::get<1>(output), output_tensor));
  return true;
}

bool HexagonControlWrapper::ReadOutputNode(
    const string& node_name, std::vector<ByteArray>* const outputs) {
  CHECK(outputs != nullptr);
  ByteArray output;
  const string tensor_name = AddPort(node_name);
  CHECK(output_port_map_.count(tensor_name) > 0);
  const int port = output_port_map_.at(tensor_name);
  soc_interface_ReadOutputNodeWithPort(
      port, &std::get<0>(output),
      reinterpret_cast<uint64_t*>(&std::get<1>(output)));
  // TODO: Accept all results
  // std::get<2>(output) = DT_FLOAT;
  outputs->emplace_back(output);
  return true;
}

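// Rewrites |original_graph_def| so that the subgraph supported by the Hexagon
// ops definitions is collapsed into a remote fused graph node executed by
// this wrapper.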
Status HexagonControlWrapper::FuseRemoteGraph(
    const GraphDef& original_graph_def, const std::vector<string>& inputs,
    const std::vector<string>& outputs, GraphDef* fused_graph_def) {
  const std::unordered_set<string> fused_node_names =
      RemoteFusedGraphExecuteUtils::BuildNodeMapFromOpsDefinitions(
          original_graph_def, HexagonOpsDefinitions::getInstance());
  // TODO(satok): We may want to place shape and type inside this function
  // if they are not placed in the given graph.
  TF_RETURN_IF_ERROR(RemoteFusedGraphExecuteUtils::FuseRemoteGraphByNodeNames(
      original_graph_def, inputs, outputs, REMOTE_FUSED_GRAPH_NODE_NAME_PREFIX,
      fused_node_names, REMOTE_FUSED_GRAPH_EXECUTOR_NAME,
      /*require_shape_type=*/true, fused_graph_def));
  return Status::OK();
}

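// Convenience overload: unpacks a Tensor into a (data, byte size, dtype)
// ConstByteArray and forwards it to the overload above.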
bool HexagonControlWrapper::FillInputNode(const string& node_name,
                                          const Tensor& tensor) {
  StringPiece tensor_data = tensor.tensor_data();
  const ConstByteArray ba =
      ConstByteArray(reinterpret_cast<const uint8*>(tensor_data.data()),
                     tensor_data.size(), tensor.dtype());
  if (DBG_DUMP_INPUT_TENSOR_AS_FLOAT_DATA) {
    LOG(INFO) << "Input tensor data: element size = " << tensor.NumElements()
              << ", byte size = " << tensor.TotalBytes();
    std::stringstream line;
    for (int i = 0; i < tensor.NumElements(); ++i) {
      line << tensor.flat<float>().data()[i] << ", ";
      if ((i - 2) % 3 == 0 || i == tensor.NumElements() - 1) {
        LOG(INFO) << "(" << ((i - 2) / 3) << ") " << line.str();
        line.str("");
        line.clear();
      }
    }
  }
  const std::array<int64, GraphTransferer::SHAPE_ARRAY_SIZE> shape =
      GraphTransferer::ToTensorShapeArray(tensor.shape());
  FillInputNode(node_name, shape, ba);
  return true;
}

bool HexagonControlWrapper::IsEnabled() const { return true; }

}  // namespace tensorflow