/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "CpuExecutor"

#include "CpuExecutor.h"

#include <android-base/scopeguard.h>
#include <nnapi/SharedMemory.h>
#include <nnapi/TypeUtils.h>

#include <limits>
#include <memory>
#include <utility>
#include <vector>

#include "ControlFlow.h"
#include "NeuralNetworks.h"
#include "OperationResolver.h"
#include "Operations.h"
#include "OperationsUtils.h"
#include "Tracing.h"

// b/109953668, disable OpenMP
#ifdef NNAPI_OPENMP
#include <omp.h>

#include <Eigen/Core>
#endif  // NNAPI_OPENMP

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
#include "operations/BidirectionalSequenceLSTM.h"
#include "operations/Cast.h"
#include "operations/EmbeddingLookup.h"
#include "operations/ExpandDims.h"
#include "operations/HashtableLookup.h"
#include "operations/LSHProjection.h"
#include "operations/LSTM.h"
#include "operations/MaximumMinimum.h"
#include "operations/Multinomial.h"
#include "operations/Pow.h"
#include "operations/QuantizedLSTM.h"
#include "operations/RNN.h"
#include "operations/SVDF.h"
#include "operations/Tile.h"
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

namespace android {
namespace nn {
namespace {

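// Per-operation view of the runtime operands (types, shapes, buffers) handed to
// operation implementations resolved through the OperationResolver.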
class OperationExecutionContext : public IOperationExecutionContext {
    DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);

   public:
    OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
        : operation(operation), operands(operands) {}

    uint32_t getNumInputs() const override;
    OperandType getInputType(uint32_t index) const override;
    Shape getInputShape(uint32_t index) const override;
    const void* getInputBuffer(uint32_t index) const override;
    const Operand::ExtraParams& getInputExtraParams(uint32_t index) const override;

    uint32_t getNumOutputs() const override;
    OperandType getOutputType(uint32_t index) const override;
    Shape getOutputShape(uint32_t index) const override;
    void* getOutputBuffer(uint32_t index) override;

    // Return false on failure and store the result code.
    // Use getResultCode() to retrieve it at the end of the operation execution.
    bool setOutputShape(uint32_t index, const Shape& shape) override;
    int getResultCode() const;

    bool isOmittedInput(uint32_t index) const override;
    bool isOmittedOutput(uint32_t index) const override;

    // Return false if any of inputs or outputs is omitted, i.e. has lifetime of NO_VALUE.
    bool checkNoOmittedOperand() const;
    // Return false if any of inputs has dimension 0.
    bool checkNoZeroSizedInput() const;

   private:
    const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
    const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
    RunTimeOperandInfo* getOutputInfo(uint32_t index);

    const Operation* operation;
    RunTimeOperandInfo* operands;

    int result = ANEURALNETWORKS_NO_ERROR;
};

const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
    CHECK(index < operation->inputs.size());
    return &operands[operation->inputs[index]];
}

const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
    CHECK(index < operation->outputs.size());
    return &operands[operation->outputs[index]];
}

RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
    CHECK(index < operation->outputs.size());
    return &operands[operation->outputs[index]];
}

OperandType OperationExecutionContext::getInputType(uint32_t index) const {
    return getInputInfo(index)->type;
}

Shape OperationExecutionContext::getInputShape(uint32_t index) const {
    return getInputInfo(index)->shape();
}

const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
    return getInputInfo(index)->buffer;
}

const Operand::ExtraParams& OperationExecutionContext::getInputExtraParams(uint32_t index) const {
    return getInputInfo(index)->extraParams;
}

OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
    return getOutputInfo(index)->type;
}

Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
    return getOutputInfo(index)->shape();
}

void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
    return getOutputInfo(index)->buffer;
}

uint32_t OperationExecutionContext::getNumInputs() const {
    return operation->inputs.size();
}

uint32_t OperationExecutionContext::getNumOutputs() const {
    return operation->outputs.size();
}

int OperationExecutionContext::getResultCode() const {
    return result;
}

// TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
// Updates the RunTimeOperandInfo with the newly calculated shape.
// Allocate the buffer if we need to.
//
// TODO(b/153081229): This function currently cannot handle extension operands well. We need to
// propagate the extension type info into this function.
bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
    // For user-provided model output operands, the parameters must match the Shape
    // calculated from the preparation step.
    if (info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
        if (info->type != shape.type) {
            LOG(ERROR) << "Invalid type for model output";
            *result = ANEURALNETWORKS_OP_FAILED;
            return false;
        }
        if (info->scale != shape.scale) {
            LOG(ERROR) << "Invalid scale for model output";
            *result = ANEURALNETWORKS_OP_FAILED;
            return false;
        }
        if (info->zeroPoint != shape.offset) {
            LOG(ERROR) << "Invalid zeroPoint for model output";
            *result = ANEURALNETWORKS_OP_FAILED;
            return false;
        }
        if (info->extraParams != shape.extraParams) {
            LOG(ERROR) << "Invalid extraParams for model output";
            *result = ANEURALNETWORKS_OP_FAILED;
            return false;
        }
    }

    auto combined = combineDimensions(shape.dimensions, info->dimensions);
    if (!combined.has_value()) {
        LOG(ERROR) << "Invalid dimensions for model operand: " << combined.error();
        *result = ANEURALNETWORKS_OP_FAILED;
        return false;
    }
    info->dimensions = std::move(combined.value());
    info->type = shape.type;
    info->scale = shape.scale;
    info->zeroPoint = shape.offset;
    info->extraParams = shape.extraParams;

    // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
    // the sizes of extension types.
    if (!isExtension(info->type) &&
        nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
        LOG(ERROR) << "Operand data size overflows uint32_t";
        *result = ANEURALNETWORKS_OP_FAILED;
        return false;
    }

    // Allocate the buffer only if the combined dimension is fully specified
    if (info->buffer == nullptr && (info->lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
                                    info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT)) {
        if (isExtension(info->type)) {
            LOG(ERROR) << "Cannot allocate a variable of an extension type";
            *result = ANEURALNETWORKS_OP_FAILED;
            return false;
        }
        uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
        if (length > 0) {
            info->buffer = new uint8_t[length];
            if (info->buffer == nullptr) {
                *result = ANEURALNETWORKS_OUT_OF_MEMORY;
                return false;
            }
            info->length = length;
        }
    }
    if (!info->isSufficient()) {
        uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
        LOG(ERROR) << "Insufficient size for model operand: require = " << length
                   << ", provided = " << info->length;
        *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
        return false;
    }
    *result = ANEURALNETWORKS_NO_ERROR;
    return true;
}

bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
    return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
}

bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
    return getInputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
}

bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
    return getOutputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
}

bool OperationExecutionContext::checkNoOmittedOperand() const {
    for (uint32_t i = 0; i < operation->inputs.size(); i++) {
        NN_RET_CHECK(!isOmittedInput(i))
                << operation->type << " input operand " << i << " is required but missing.";
    }
    for (uint32_t i = 0; i < operation->outputs.size(); i++) {
        NN_RET_CHECK(!isOmittedOutput(i))
                << operation->type << " output operand " << i << " is required but missing.";
    }
    return true;
}

bool OperationExecutionContext::checkNoZeroSizedInput() const {
    for (uint32_t i = 0; i < operation->inputs.size(); i++) {
        if (isOmittedInput(i)) continue;
        for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
            NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
                    << operation->type << " does not support zero-sized tensor, but input " << i
                    << " dimension " << j << " is 0.";
        }
    }
    return true;
}

}  // namespace

// Used to keep a pointer to a memory pool.
//
// In the case of an "mmap_fd" pool, owns the mmap region
// returned by getBuffer() -- i.e., that region goes away
// when the RunTimePoolInfo is destroyed or is assigned to.
class RunTimePoolInfo::RunTimePoolInfoImpl {
   public:
    RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping);

    uint8_t* getBuffer() const;
    uint32_t getSize() const;

    bool flush() const;

    const SharedMemory& getMemory() const { return mMemory; }

   private:
    const SharedMemory mMemory;
    const Mapping mMapping;
};

RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping)
    : mMemory(std::move(memory)), mMapping(std::move(mapping)) {}

uint8_t* RunTimePoolInfo::RunTimePoolInfoImpl::getBuffer() const {
    return std::visit(
            [](auto* pointer) {
                // Writing to a const buffer may lead to undefined behavior.
                // TODO: Refactor the code to avoid the const_cast.
                return static_cast<uint8_t*>(const_cast<void*>(pointer));
            },
            mMapping.pointer);
}

uint32_t RunTimePoolInfo::RunTimePoolInfoImpl::getSize() const {
    CHECK_LE(mMapping.size, std::numeric_limits<uint32_t>::max());
    return static_cast<uint32_t>(mMapping.size);
}

// Makes sure the output data are correctly updated after execution.
bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
    return nn::flush(mMapping);
}

// TODO: short term, make shared memory mapping and updating a utility function.
// TODO: long term, implement mmap_fd as a hidl IMemory service.
std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromMemory(const SharedMemory& memory) {
    auto mapping = map(memory);
    if (!mapping.has_value()) {
        LOG(ERROR) << "Can't map shared memory: " << mapping.error().message;
        return std::nullopt;
    }
    const auto impl =
            std::make_shared<const RunTimePoolInfoImpl>(memory, std::move(mapping).value());
    return RunTimePoolInfo(impl);
}

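// Wraps a caller-provided buffer; unlike the mmap case above, the returned
// RunTimePoolInfo does not take ownership of the memory.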
RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
    auto mapping = Mapping{.pointer = buffer, .size = size};
    const auto impl = std::make_shared<const RunTimePoolInfoImpl>(std::make_shared<const Memory>(),
                                                                  std::move(mapping));
    return RunTimePoolInfo(impl);
}

RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
    : mImpl(impl) {}

uint8_t* RunTimePoolInfo::getBuffer() const {
    return mImpl->getBuffer();
}

uint32_t RunTimePoolInfo::getSize() const {
    return mImpl->getSize();
}

bool RunTimePoolInfo::flush() const {
    return mImpl->flush();
}

const SharedMemory& RunTimePoolInfo::getMemory() const {
    return mImpl->getMemory();
}

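// Maps each canonical SharedMemory in "pools" and appends the resulting RunTimePoolInfo to
// "poolInfos". On failure, clears "poolInfos" and returns false.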
bool setRunTimePoolInfosFromCanonicalMemories(std::vector<RunTimePoolInfo>* poolInfos,
                                              const std::vector<SharedMemory>& pools) {
    CHECK(poolInfos != nullptr);
    poolInfos->clear();
    poolInfos->reserve(pools.size());
    for (const auto& pool : pools) {
        if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromMemory(pool)) {
            poolInfos->push_back(*poolInfo);
        } else {
            LOG(ERROR) << "Could not map pools";
            poolInfos->clear();
            return false;
        }
    }
    return true;
}

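// Same as above, but accepts Request::MemoryPool entries and rejects any pool that is not
// backed by a SharedMemory.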
bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
                                        const std::vector<Request::MemoryPool>& pools) {
    CHECK(poolInfos != nullptr);
    poolInfos->clear();
    poolInfos->reserve(pools.size());
    for (const auto& pool : pools) {
        if (!std::holds_alternative<SharedMemory>(pool)) {
            LOG(ERROR) << "Unknown memory token";
            poolInfos->clear();
            return false;
        }
        if (std::optional<RunTimePoolInfo> poolInfo =
                    RunTimePoolInfo::createFromMemory(std::get<SharedMemory>(pool))) {
            poolInfos->push_back(*poolInfo);
        } else {
            LOG(ERROR) << "Could not map pools";
            poolInfos->clear();
            return false;
        }
    }
    return true;
}

#ifdef NN_INCLUDE_CPU_IMPLEMENTATION
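// Copies a 4-D tensor from NCHW layout ("from", with dimensions fromDim in NCHW order)
// into NHWC layout ("to").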
template <typename T>
inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    uint32_t spatialSize = fromDim[2] * fromDim[3];
    for (uint32_t n = 0; n < fromDim[0]; n++) {
        for (uint32_t hw = 0; hw < spatialSize; hw++) {
            for (uint32_t c = 0; c < fromDim[1]; c++) {
                uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
                *to++ = from[fromIndex];
            }
        }
    }
    return true;
}

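// Copies a 4-D tensor from NHWC layout ("from", with dimensions fromDim in NHWC order)
// back into NCHW layout ("to").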
template <typename T>
inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
    uint32_t spatialSize = fromDim[1] * fromDim[2];
    for (uint32_t n = 0; n < fromDim[0]; n++) {
        for (uint32_t c = 0; c < fromDim[3]; c++) {
            for (uint32_t hw = 0; hw < spatialSize; hw++) {
                uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
                *to++ = from[fromIndex];
            }
        }
    }
    return true;
}

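// Prepares "to" as the NHWC view of "from". If data_layout is true (NCHW input), a temporary
// buffer owned by ptr_guard is allocated and the data is transposed into it; otherwise "to"
// simply aliases "from".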
static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
                          std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
    int result;
    if (from.dimensions.size() != 4) {
        LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
        return false;
    }
    to.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
    if (data_layout) {
        // convert dimensions
        Shape inShape = from.shape();
        auto& fromDim = from.dimensions;
        inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
        // allocate buffer
        to.buffer = nullptr;
        if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
            return false;
        }
        ptr_guard.reset(to.buffer);
        // convert value
        if (from.type == OperandType::TENSOR_FLOAT32) {
            return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
                                            reinterpret_cast<const float*>(from.buffer), fromDim);
        } else if (from.type == OperandType::TENSOR_FLOAT16) {
            return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
                                               reinterpret_cast<const _Float16*>(from.buffer),
                                               fromDim);
        } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
            return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
                                              reinterpret_cast<const uint8_t*>(from.buffer),
                                              fromDim);
        } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
            return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
                                             reinterpret_cast<const int8_t*>(from.buffer), fromDim);
        } else {
            LOG(ERROR) << "Unsupported data type";
            return false;
        }
    } else {
        to = from;
    }
    return true;
}

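// Writes the NHWC result in "from" back into "to". If data_layout is true, the shape is
// converted back to NCHW and the data transposed; otherwise "to" adopts "from"'s buffer directly.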
static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
                            bool data_layout, int* result) {
    if (from.dimensions.size() != 4) {
        LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
        return false;
    }
    if (data_layout) {
        // convert dimensions
        Shape outShape = from.shape();
        auto& fromDim = from.dimensions;
        outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
        // allocate buffer
        if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
            return false;
        }
        // convert value
        if (from.type == OperandType::TENSOR_FLOAT32) {
            return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
                                              reinterpret_cast<const float*>(from.buffer), fromDim);
        } else if (from.type == OperandType::TENSOR_FLOAT16) {
            return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
                                                 reinterpret_cast<const _Float16*>(from.buffer),
                                                 fromDim);
        } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
            return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
                                                reinterpret_cast<const uint8_t*>(from.buffer),
                                                fromDim);
        } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
            return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
                                               reinterpret_cast<const int8_t*>(from.buffer),
                                               fromDim);
        } else {
            LOG(ERROR) << "Unsupported data type";
            return false;
        }
    } else {
        Shape outShape = from.shape();
        to.buffer = from.buffer;
        to.length = from.length;
        if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
            return false;
        }
    }
    return true;
}
#endif  // NN_INCLUDE_CPU_IMPLEMENTATION

// Decrements the usage count for the operands listed. Frees the memory
// allocated for any temporary variable with a count of zero.
static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
                                   RunTimeOperandInfo* operands) {
    for (uint32_t i : inputs) {
        auto& info = operands[i];
        // Check if it's a static or model input/output.
        if (info.numberOfUsesLeft == 0) {
            continue;
        }
        info.numberOfUsesLeft--;
        if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
            delete[] info.buffer;
            info.buffer = nullptr;
        }
    }
}

// This function only frees TEMPORARY_VARIABLE operands that are unused
// outputs because consumeOperationInputs takes care of any operands
// that are inputs to an operation.
static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
    for (auto& info : *operands) {
        if (info.lifetime == Operand::LifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
            info.buffer != nullptr) {
            delete[] info.buffer;
            info.buffer = nullptr;
        }
    }
}

// Ignore the .pools entry in model and request. This will have been taken care of
// by the caller.
int CpuExecutor::run(const Model& model, const Request& request,
                     const std::vector<RunTimePoolInfo>& modelPoolInfos,
                     const std::vector<RunTimePoolInfo>& requestPoolInfos) {
    NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
    VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(request) << ")";
    mModelOperandValues = model.operandValues.data();
    mModelPoolInfos = &modelPoolInfos;
    mReferencedSubgraphs = &model.referenced;

    // b/109953668, disable OpenMP
#ifdef NNAPI_OPENMP
    ScopedOpenmpSettings openMpSettings;
#endif  // NNAPI_OPENMP

    std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
    updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
    updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
                       operands.data());
    int result = executeSubgraph(model.main, operands.data());
    freeUnusedSubgraphOperands(&operands);

    if (result == ANEURALNETWORKS_NO_ERROR) {
        VLOG(CPUEXE) << "Completed run normally";
        for (auto& runtimeInfo : requestPoolInfos) {
            runtimeInfo.flush();
        }
    }

    // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
    if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
        setOutputShapes(model.main.outputIndexes, operands);
    } else {
        mOutputShapes.clear();
    }

    mFinished = true;
    mModelOperandValues = nullptr;
    mModelPoolInfos = nullptr;
    mReferencedSubgraphs = nullptr;
    return result;
}

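// Runs every operation of the subgraph in order and stops at the first failure.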
int CpuExecutor::executeSubgraph(const Model::Subgraph& subgraph, RunTimeOperandInfo* operands) {
    VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << subgraph;
    // The subgraph stores its operations serialized in execution order.
    for (const auto& operation : subgraph.operations) {
        NN_RETURN_IF_ERROR(executeOperation(operation, operands));
    }
    return ANEURALNETWORKS_NO_ERROR;
}

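// Builds a RunTimeOperandInfo for every operand of the subgraph: constants are pointed at their
// backing memory, temporaries start unallocated with a consumer count, and subgraph
// inputs/outputs are left unset to be filled in later (e.g. by updateForArguments()).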
std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(
        const Model::Subgraph& subgraph) {
    VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
    const size_t count = subgraph.operands.size();
    std::vector<RunTimeOperandInfo> operands(count);
    std::vector<uint32_t> numberOfConsumers =
            countNumberOfConsumers(count, subgraph.operations).value();
    for (size_t i = 0; i < count; i++) {
        const Operand& from = subgraph.operands[i];
        RunTimeOperandInfo& to = operands[i];
        to.type = from.type;
        to.dimensions = from.dimensions;
        to.scale = from.scale;
        to.zeroPoint = from.zeroPoint;
        to.length = from.location.length;
        to.lifetime = from.lifetime;
        to.extraParams = from.extraParams;
        switch (from.lifetime) {
            case Operand::LifeTime::TEMPORARY_VARIABLE:
                to.buffer = nullptr;
                to.numberOfUsesLeft = numberOfConsumers[i];
                break;
            case Operand::LifeTime::CONSTANT_COPY:
                to.buffer = const_cast<uint8_t*>(mModelOperandValues + from.location.offset);
                to.numberOfUsesLeft = 0;
                break;
            case Operand::LifeTime::CONSTANT_REFERENCE: {
                auto poolIndex = from.location.poolIndex;
                CHECK_LT(poolIndex, mModelPoolInfos->size());
                auto& r = (*mModelPoolInfos)[poolIndex];
                to.buffer = r.getBuffer() + from.location.offset;
                to.numberOfUsesLeft = 0;
                break;
            }
            case Operand::LifeTime::SUBGRAPH: {
                auto subgraphIndex = from.location.offset;
                CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
                to.buffer = reinterpret_cast<uint8_t*>(
                        const_cast<Model::Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
                to.numberOfUsesLeft = 0;
            } break;
            case Operand::LifeTime::POINTER: {
                to.buffer = reinterpret_cast<uint8_t*>(
                        const_cast<void*>(std::get<const void*>(from.location.pointer)));
                to.numberOfUsesLeft = 0;
            } break;
            case Operand::LifeTime::SUBGRAPH_INPUT:
            case Operand::LifeTime::SUBGRAPH_OUTPUT:
            case Operand::LifeTime::NO_VALUE:
                to.buffer = nullptr;
                to.numberOfUsesLeft = 0;
                break;
        }
    }
    return operands;
}

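// Points the subgraph input/output operands at the buffers (or memory pool regions) supplied in
// the request arguments, and applies any dimensions the caller specified at execution time.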
void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
                                     const std::vector<Request::Argument>& arguments,
                                     const std::vector<RunTimePoolInfo>& requestPoolInfos,
                                     RunTimeOperandInfo* operands) {
    CHECK_EQ(indexes.size(), arguments.size());
    for (size_t i = 0; i < indexes.size(); i++) {
        const uint32_t operandIndex = indexes[i];
        const Request::Argument& from = arguments[i];
        RunTimeOperandInfo& to = operands[operandIndex];
        if (!from.dimensions.empty()) {
            // It's the responsibility of the caller to validate that
            // from.dimensions only modifies the dimensions that were
            // unspecified in the model. That's the case in SampleDriver.cpp
            // with the call to validateRequest().
            // TODO make sure that's the case for the default CPU path.
            to.dimensions = from.dimensions;
        }
        switch (from.lifetime) {
            case Request::Argument::LifeTime::NO_VALUE: {
                to.lifetime = Operand::LifeTime::NO_VALUE;
                CHECK(to.buffer == nullptr);
                to.length = 0;
                break;
            }
            case Request::Argument::LifeTime::POOL: {
                auto poolIndex = from.location.poolIndex;
                CHECK_LT(poolIndex, requestPoolInfos.size());
                auto& r = requestPoolInfos[poolIndex];
                to.buffer = r.getBuffer() + from.location.offset;
                if (from.location.offset == 0 && from.location.length == 0) {
                    // Use the entire memory region.
                    to.length = r.getSize();
                } else {
                    to.length = from.location.length;
                }
                break;
            }
            case Request::Argument::LifeTime::POINTER: {
                constexpr auto fn = [](const void* ptr) {
                    return static_cast<const uint8_t*>(ptr);
                };
                auto ptr = std::visit(fn, from.location.pointer);
                // Writing to a const buffer may lead to undefined behavior.
                // TODO: Refactor the code to avoid the const_cast.
                to.buffer = const_cast<uint8_t*>(ptr);
                to.length = from.location.length;
                break;
            }
        }
    }
}

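// Executes a single operation. Control flow (IF/WHILE) is handled by dedicated helpers; the
// operations below are implemented inline in the switch statement.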
executeOperation(const Operation & operation,RunTimeOperandInfo * operands)718 int CpuExecutor::executeOperation(const Operation& operation, RunTimeOperandInfo* operands) {
719 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
720 if (hasDeadlinePassed(mDeadline)) {
721 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
722 }
723 if (operation.type == OperationType::IF) {
724 int result = executeIfOperation(operation, operands);
725 if (result != ANEURALNETWORKS_NO_ERROR) {
726 LOG(ERROR) << "IF failed.";
727 }
728 return result;
729 }
730 if (operation.type == OperationType::WHILE) {
731 int result = executeWhileOperation(operation, operands);
732 if (result != ANEURALNETWORKS_NO_ERROR) {
733 LOG(ERROR) << "WHILE failed.";
734 }
735 return result;
736 }
737
738 // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << operation << ")";
739 const std::vector<uint32_t>& ins = operation.inputs;
740 const std::vector<uint32_t>& outs = operation.outputs;
741 bool success = false;
742 int result = ANEURALNETWORKS_NO_ERROR;
743
744 // Function to verify that the number of input and output parameters
745 // matches what is expected. Also checks that all the parameters have
746 // values. This function is to be used only for operations that do not
747 // accept optional arguments.
748 // TODO Have a version that works for optional arguments.
749 auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
750 size_t requiredOuts) -> bool {
751 auto verify = [&operation, &operands](size_t requiredCount,
752 const std::vector<uint32_t>& indexes,
753 const char* type) -> bool {
754 size_t actualCount = indexes.size();
755 if (actualCount != requiredCount) {
756 LOG(ERROR) << operation.type << ": Invalid number of " << type << " operands. Got "
757 << actualCount << " of " << requiredCount;
758 return false;
759 }
760 for (size_t i = 0; i < actualCount; i++) {
761 if (operands[indexes[i]].lifetime == Operand::LifeTime::NO_VALUE) {
762 LOG(ERROR) << operation.type << " " << type << " operand " << i
763 << " is required but missing.";
764 return false;
765 }
766 }
767 return true;
768 };
769
770 auto verifyNoZeroSizedInputs = [&operation,
771 &operands](const std::vector<uint32_t>& indexes) {
772 for (size_t i = 0; i < indexes.size(); i++) {
773 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
774 if (operands[indexes[i]].dimensions[j] == 0) {
775 LOG(ERROR) << operation.type
776 << " does not support zero-sized tensor, but input " << i
777 << " dimension " << j << " is zero.";
778 return false;
779 }
780 }
781 }
782 return true;
783 };
784
785 return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
786 verifyNoZeroSizedInputs(ins);
787 };
788
789 switch (operation.type) {
790 case OperationType::OEM_OPERATION: {
791 LOG(ERROR) << "OEM operation not supported for CPU execution";
792 success = false;
793 } break;
794 case OperationType::RESHAPE: {
795 if (!allParametersPresent(2, 1)) {
796 return ANEURALNETWORKS_BAD_DATA;
797 }
798 const RunTimeOperandInfo& input = operands[ins[0]];
799 const RunTimeOperandInfo& targetShape = operands[ins[1]];
800
801 RunTimeOperandInfo& output = operands[outs[0]];
802 Shape outShape = output.shape();
803
804 success = reshapePrepare(input.shape(),
805 reinterpret_cast<const int32_t*>(targetShape.buffer),
806 getNumberOfElements(targetShape.shape()), &outShape) &&
807 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
808 copyData(input.buffer, input.shape(), output.buffer, outShape);
809 } break;
810 case OperationType::DEPTH_TO_SPACE: {
811 const size_t inCount = ins.size();
812 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
813 return ANEURALNETWORKS_BAD_DATA;
814 }
815 const RunTimeOperandInfo& input = operands[ins[0]];
816 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
817 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
818
819 RunTimeOperandInfo& output = operands[outs[0]];
820 Shape outShape = output.shape();
821
822 RunTimeOperandInfo input_tmp, output_tmp;
823 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
824 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
825 success = false;
826 break;
827 }
828 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
829 output_tmp.buffer = data_layout ? nullptr : output.buffer;
830 output_tmp.length = data_layout ? 0 : output.length;
831 if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
832 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
833 if (!data_layout) output.dimensions = output_tmp.dimensions;
834 break;
835 }
836 switch (input_tmp.type) {
837 case OperandType::TENSOR_FLOAT32: {
838 success = depthToSpaceGeneric(
839 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
840 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
841 break;
842 }
843 case OperandType::TENSOR_FLOAT16: {
844 success = depthToSpaceGeneric(
845 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
846 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
847 break;
848 }
849 case OperandType::TENSOR_QUANT8_ASYMM: {
850 success = depthToSpaceGeneric(
851 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
852 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
853 break;
854 }
855 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
856 success = depthToSpaceGeneric(
857 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
858 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
859 break;
860 }
861 default: {
862 LOG(ERROR) << "Unsupported data type";
863 success = false;
864 }
865 }
866 if (data_layout) {
867 output_tmp_guard.reset(output_tmp.buffer);
868 }
869 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
870 success = false;
871 break;
872 }
873 } break;
874 case OperationType::SPACE_TO_DEPTH: {
875 const size_t inCount = ins.size();
876 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
877 return ANEURALNETWORKS_BAD_DATA;
878 }
879 const RunTimeOperandInfo& input = operands[ins[0]];
880 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
881 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
882
883 RunTimeOperandInfo& output = operands[outs[0]];
884 Shape outShape = output.shape();
885
886 RunTimeOperandInfo input_tmp, output_tmp;
887 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
888 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
889 success = false;
890 break;
891 }
892 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
893 output_tmp.buffer = data_layout ? nullptr : output.buffer;
894 output_tmp.length = data_layout ? 0 : output.length;
895
896 if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
897 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
898 if (!data_layout) output.dimensions = output_tmp.dimensions;
899 break;
900 }
901 switch (input_tmp.type) {
902 case OperandType::TENSOR_FLOAT32: {
903 success = spaceToDepthGeneric(
904 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
905 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
906 break;
907 }
908 case OperandType::TENSOR_FLOAT16: {
909 success = spaceToDepthGeneric(
910 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
911 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
912 break;
913 }
914 case OperandType::TENSOR_QUANT8_ASYMM: {
915 success = spaceToDepthGeneric(
916 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
917 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
918 break;
919 }
920 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
921 success = spaceToDepthGeneric(
922 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
923 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
924 break;
925 }
926 default: {
927 LOG(ERROR) << "Unsupported data type";
928 success = false;
929 }
930 }
931 if (data_layout) {
932 output_tmp_guard.reset(output_tmp.buffer);
933 }
934 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
935 success = false;
936 break;
937 }
938 } break;
939 case OperationType::EMBEDDING_LOOKUP: {
940 if (!allParametersPresent(2, 1)) {
941 return ANEURALNETWORKS_BAD_DATA;
942 }
943 const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
944 const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
945 RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
946
947 Shape outputShape;
948 EmbeddingLookup lookup(operation, operands);
949
950 success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
951 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
952 } break;
953 case OperationType::HASHTABLE_LOOKUP: {
954 if (!allParametersPresent(3, 2)) {
955 return ANEURALNETWORKS_BAD_DATA;
956 }
957 const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
958 const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
959 const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
960
961 RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
962 RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
963
964 Shape outputShape, hitShape;
965 HashtableLookup lookup(operation, operands);
966
967 success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
968 &outputShape, &hitShape) &&
969 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
970 setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
971 } break;
972 case OperationType::LSH_PROJECTION: {
973 RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
974 Shape outputShape;
975 if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
976 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
977 break;
978 }
979
980 LSHProjection lsh(operation, operands);
981 const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
982 switch (hash.type) {
983 case OperandType::TENSOR_FLOAT32: {
984 success = lsh.Eval<float>();
985 break;
986 }
987 case OperandType::TENSOR_FLOAT16: {
988 success = lsh.Eval<_Float16>();
989 break;
990 }
991 default: {
992 success = false;
993 LOG(ERROR) << "Unsupported data type";
994 }
995 }
996 } break;
997 case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
998 const auto merge_outputs = getScalarData<bool>(
999 operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1000 const bool output_state = (outs.size() == 5 || outs.size() == 6);
1001 RunTimeOperandInfo& fwOutput =
1002 operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1003 Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1004 fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1005
1006 BidirectionalSequenceLSTM lstm(operation, operands);
1007 success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1008 &fwOutputActivationStateShape, &fwOutputCellStateShape,
1009 &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1010 setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1011 if (!merge_outputs) {
1012 RunTimeOperandInfo& bwOutput =
1013 operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1014 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1015 }
1016 if (output_state) {
1017 uint32_t delta = merge_outputs ? 1 : 0;
1018 RunTimeOperandInfo& fwOutputActivationState =
1019 operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1020 delta]];
1021 RunTimeOperandInfo& fwOutputCellState =
1022 operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1023 RunTimeOperandInfo& bwOutputActivationState =
1024 operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1025 delta]];
1026 RunTimeOperandInfo& bwOutputCellState =
1027 operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1028 success = success &&
1029 setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1030 fwOutputActivationStateShape, &result) &&
1031 setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1032 &result) &&
1033 setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1034 bwOutputActivationStateShape, &result) &&
1035 setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1036 &result);
1037 }
1038 success = success && lstm.Eval();
1039 } break;
1040 case OperationType::LSTM: {
1041 RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1042 RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1043 RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1044 RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1045
1046 Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1047 LSTMCell lstm_cell(operation, operands);
1048
1049 success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1050 &cellStateShape, &outputShape) &&
1051 setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1052 setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1053 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1054 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1055 } break;
1056 case OperationType::RANDOM_MULTINOMIAL: {
1057 if (!allParametersPresent(3, 1)) {
1058 return ANEURALNETWORKS_BAD_DATA;
1059 }
1060 RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1061
1062 Shape outputShape;
1063 Multinomial multinomial(operation, operands);
1064
1065 success = Multinomial::Prepare(operation, operands, &outputShape) &&
1066 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1067 multinomial.Eval();
1068 } break;
1069 case OperationType::RNN: {
1070 if (!allParametersPresent(6, 2)) {
1071 return ANEURALNETWORKS_BAD_DATA;
1072 }
1073
1074 RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1075 RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1076
1077 Shape hiddenStateShape, outputShape;
1078 RNN rnn_cell(operation, operands);
1079
1080 success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1081 setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1082 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1083 } break;
1084 case OperationType::SVDF: {
1085 RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1086 RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1087
1088 Shape stateShape, outputShape;
1089 SVDF svdf(operation, operands);
1090
1091 success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1092 setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1093 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1094 } break;
1095 case OperationType::BATCH_TO_SPACE_ND: {
1096 const size_t inCount = ins.size();
1097 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1098 return ANEURALNETWORKS_BAD_DATA;
1099 }
1100 const RunTimeOperandInfo& input = operands[ins[0]];
1101 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1102 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1103
1104 RunTimeOperandInfo& output = operands[outs[0]];
1105 Shape outShape = output.shape();
1106
1107 RunTimeOperandInfo input_tmp, output_tmp;
1108 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1109 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1110 success = false;
1111 break;
1112 }
1113 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1114 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1115 output_tmp.length = data_layout ? 0 : output.length;
1116
1117 if (!batchToSpacePrepare(input_tmp.shape(),
1118 reinterpret_cast<const int32_t*>(blockSize.buffer),
1119 blockSize.shape(), &outShape) ||
1120 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1121 if (!data_layout) output.dimensions = output_tmp.dimensions;
1122 break;
1123 }
1124 switch (input_tmp.type) {
1125 case OperandType::TENSOR_FLOAT32: {
1126 success = batchToSpaceGeneric(
1127 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1128 reinterpret_cast<const int32_t*>(blockSize.buffer),
1129 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1130 break;
1131 }
1132 case OperandType::TENSOR_FLOAT16: {
1133 success = batchToSpaceGeneric(
1134 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1135 reinterpret_cast<const int32_t*>(blockSize.buffer),
1136 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1137 break;
1138 }
1139 case OperandType::TENSOR_QUANT8_ASYMM: {
1140 success = batchToSpaceGeneric(
1141 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1142 reinterpret_cast<const int32_t*>(blockSize.buffer),
1143 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1144 break;
1145 }
1146 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1147 success = batchToSpaceGeneric(
1148 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1149 reinterpret_cast<const int32_t*>(blockSize.buffer),
1150 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1151 break;
1152 }
1153 default: {
1154 LOG(ERROR) << "Unsupported data type";
1155 success = false;
1156 }
1157 }
1158 if (data_layout) {
1159 output_tmp_guard.reset(output_tmp.buffer);
1160 }
1161 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1162 success = false;
1163 break;
1164 }
1165 } break;
1166 case OperationType::SPACE_TO_BATCH_ND: {
1167 const size_t inCount = ins.size();
1168 if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1169 return ANEURALNETWORKS_BAD_DATA;
1170 }
1171 const RunTimeOperandInfo& input = operands[ins[0]];
1172 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1173 const RunTimeOperandInfo& paddings = operands[ins[2]];
1174 bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1175
1176 RunTimeOperandInfo& output = operands[outs[0]];
1177 Shape outShape = output.shape();
1178
1179 RunTimeOperandInfo input_tmp, output_tmp;
1180 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1181 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1182 success = false;
1183 break;
1184 }
1185 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1186 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1187 output_tmp.length = data_layout ? 0 : output.length;
1188
1189 if (!spaceToBatchPrepare(
1190 input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1191 blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1192 paddings.shape(), &outShape) ||
1193 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1194 if (!data_layout) output.dimensions = output_tmp.dimensions;
1195 break;
1196 }
1197 switch (input_tmp.type) {
1198 case OperandType::TENSOR_FLOAT32: {
1199 success = spaceToBatchGeneric(
1200 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1201 reinterpret_cast<const int32_t*>(blockSize.buffer),
1202 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1203 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1204 break;
1205 }
1206 case OperandType::TENSOR_FLOAT16: {
1207 success = spaceToBatchGeneric(
1208 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1209 reinterpret_cast<const int32_t*>(blockSize.buffer),
1210 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1211 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1212 break;
1213 }
1214 case OperandType::TENSOR_QUANT8_ASYMM: {
1215 success = spaceToBatchGeneric(
1216 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1217 reinterpret_cast<const int32_t*>(blockSize.buffer),
1218 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1219 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1220 break;
1221 }
1222 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1223 success = spaceToBatchGeneric(
1224 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1225 reinterpret_cast<const int32_t*>(blockSize.buffer),
1226 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1227 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1228 break;
1229 }
1230 default: {
1231 LOG(ERROR) << "Unsupported data type";
1232 success = false;
1233 }
1234 }
1235 if (data_layout) {
1236 output_tmp_guard.reset(output_tmp.buffer);
1237 }
1238 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1239 success = false;
1240 break;
1241 }
1242 } break;
1243 case OperationType::PAD:
1244 case OperationType::PAD_V2: {
1245 const bool isV2 = operation.type == OperationType::PAD_V2;
1246 if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1247 return ANEURALNETWORKS_BAD_DATA;
1248 }
1249 const RunTimeOperandInfo& input = operands[ins[0]];
1250 const RunTimeOperandInfo& paddings = operands[ins[1]];
1251
1252 RunTimeOperandInfo& output = operands[outs[0]];
1253 Shape outShape = output.shape();
1254
1255 if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1256 paddings.shape(), &outShape) ||
1257 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1258 break;
1259 }
1260 if (input.type == OperandType::TENSOR_FLOAT32) {
1261 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1262 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1263 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1264 reinterpret_cast<float*>(output.buffer), outShape);
1265 } else if (input.type == OperandType::TENSOR_FLOAT16) {
1266 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1267 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1268 reinterpret_cast<const int32_t*>(paddings.buffer),
1269 static_cast<_Float16>(pad_value),
1270 reinterpret_cast<_Float16*>(output.buffer), outShape);
1271 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1272 uint8_t pad_value =
1273 isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1274 success = padGeneric(input.buffer, input.shape(),
1275 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1276 output.buffer, outShape);
1277 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1278 uint8_t pad_value =
1279 isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1280 success = padGeneric(input.buffer, input.shape(),
1281 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1282 output.buffer, outShape);
1283 }
1284 } break;
1285 case OperationType::CAST: {
1286 if (!allParametersPresent(1, 1)) {
1287 return ANEURALNETWORKS_BAD_DATA;
1288 }
1289 const RunTimeOperandInfo& input = operands[ins[0]];
1290
1291 RunTimeOperandInfo& output = operands[outs[0]];
1292 Shape outShape = output.shape();
1293
1294 success = cast::prepare(input.shape(), &outShape) &&
1295 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1296 cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1297 } break;
1298 case OperationType::MEAN: {
1299 if (!allParametersPresent(3, 1)) {
1300 return ANEURALNETWORKS_BAD_DATA;
1301 }
1302 const RunTimeOperandInfo& input = operands[ins[0]];
1303 const RunTimeOperandInfo& axis = operands[ins[1]];
1304 int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1305
1306 RunTimeOperandInfo& output = operands[outs[0]];
1307 Shape outShape = output.shape();
1308
1309 if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1310 axis.shape(), keepDims > 0, &outShape) ||
1311 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1312 break;
1313 }
1314 if (input.type == OperandType::TENSOR_FLOAT16) {
1315 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1316 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1317 keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1318 outShape);
1319 } else if (input.type == OperandType::TENSOR_FLOAT32) {
1320 success = meanGeneric<float, float>(
1321 reinterpret_cast<float*>(input.buffer), input.shape(),
1322 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1323 reinterpret_cast<float*>(output.buffer), outShape);
1324 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1325 success = meanGeneric<uint8_t, int32_t>(
1326 reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1327 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1328 reinterpret_cast<uint8_t*>(output.buffer), outShape);
1329 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1330 success = meanGeneric<int8_t, int32_t>(
1331 reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1332 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1333 reinterpret_cast<int8_t*>(output.buffer), outShape);
1334 }
1335 } break;
1336 case OperationType::ARGMAX:
1337 case OperationType::ARGMIN: {
1338 if (!allParametersPresent(2, 1)) {
1339 return ANEURALNETWORKS_BAD_DATA;
1340 }
1341 const RunTimeOperandInfo& input = operands[ins[0]];
1342 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1343
1344 RunTimeOperandInfo& output = operands[outs[0]];
1345 Shape outShape = output.shape();
1346
1347 const bool isArgMin = operation.type == OperationType::ARGMIN;
1348 success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1349 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1350 argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1351 outShape);
1352 } break;
1353 case OperationType::EXPAND_DIMS: {
1354 if (!allParametersPresent(2, 1)) {
1355 return ANEURALNETWORKS_BAD_DATA;
1356 }
1357 const RunTimeOperandInfo& input = operands[ins[0]];
1358 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1359
1360 RunTimeOperandInfo& output = operands[outs[0]];
1361 Shape outShape = output.shape();
1362
1363 success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1364 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1365 expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1366 } break;
1367 case OperationType::SPLIT: {
1368 const size_t outCount = outs.size();
1369 if (!allParametersPresent(3, outCount)) {
1370 return ANEURALNETWORKS_BAD_DATA;
1371 }
1372
1373 const RunTimeOperandInfo& input = operands[ins[0]];
1374 const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1375 const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1376
1377 if (numOutputs != outs.size()) {
1378 return ANEURALNETWORKS_BAD_DATA;
1379 }
1380
1381 std::vector<Shape> outputShapes(numOutputs);
1382 for (int i = 0; i < numOutputs; ++i) {
1383 outputShapes[i] = operands[outs[i]].shape();
1384 }
1385
1386 success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1387 for (int i = 0; i < numOutputs; ++i) {
1388 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1389 outputShapes[i], &result);
1390 }
1391 switch (input.type) {
1392 case OperandType::TENSOR_FLOAT16: {
1393 std::vector<_Float16*> outputDataPtrs(numOutputs);
1394 for (int i = 0; i < numOutputs; ++i) {
1395 outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1396 }
1397 success = success &&
1398 splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1399 input.shape(), axis, &outputDataPtrs, outputShapes);
1400 } break;
1401 case OperandType::TENSOR_FLOAT32: {
1402 std::vector<float*> outputDataPtrs(numOutputs);
1403 for (int i = 0; i < numOutputs; ++i) {
1404 outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1405 }
1406 success = success &&
1407 splitFloat32(reinterpret_cast<const float*>(input.buffer),
1408 input.shape(), axis, &outputDataPtrs, outputShapes);
1409 } break;
1410 case OperandType::TENSOR_INT32: {
1411 std::vector<int32_t*> outputDataPtrs(numOutputs);
1412 for (int i = 0; i < numOutputs; ++i) {
1413 outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1414 }
1415 success = success &&
1416 splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1417 input.shape(), axis, &outputDataPtrs, outputShapes);
1418 } break;
1419 case OperandType::TENSOR_QUANT8_ASYMM: {
1420 std::vector<uint8_t*> outputDataPtrs(numOutputs);
1421 for (int i = 0; i < numOutputs; ++i) {
1422 outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1423 }
1424 success = success &&
1425 splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1426 input.shape(), axis, &outputDataPtrs, outputShapes);
1427 } break;
1428 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1429 std::vector<int8_t*> outputDataPtrs(numOutputs);
1430 for (int i = 0; i < numOutputs; ++i) {
1431 outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1432 }
1433 success = success &&
1434 splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1435 input.shape(), axis, &outputDataPtrs, outputShapes);
1436 } break;
1437 default: {
1438 return ANEURALNETWORKS_BAD_DATA;
1439 }
1440 }
1441 } break;
1442 case OperationType::MAXIMUM:
1443 case OperationType::MINIMUM: {
1444 if (!allParametersPresent(2, 1)) {
1445 return ANEURALNETWORKS_BAD_DATA;
1446 }
1447 const RunTimeOperandInfo& in1 = operands[ins[0]];
1448 const RunTimeOperandInfo& in2 = operands[ins[1]];
1449
1450 RunTimeOperandInfo& output = operands[outs[0]];
1451 Shape outputShape = output.shape();
1452
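            // maximum_minimum::prepare computes the broadcast-compatible output shape from the
            // two inputs; eval then applies element-wise max or min with NumPy-style broadcasting.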
1453 const bool isMinimum = operation.type == OperationType::MINIMUM;
1454 success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1455 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1456 maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1457 isMinimum, output.buffer, outputShape);
1458 } break;
1459 case OperationType::GROUPED_CONV_2D: {
1460 const size_t inCount = ins.size();
1461 if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1462 return ANEURALNETWORKS_BAD_DATA;
1463 }
1464 const RunTimeOperandInfo& input = operands[ins[0]];
1465 const RunTimeOperandInfo& filter = operands[ins[1]];
1466 const RunTimeOperandInfo& bias = operands[ins[2]];
1467
1468 int32_t padding_left, padding_right;
1469 int32_t padding_top, padding_bottom;
1470 int32_t padding_implicit = 0;
1471 int32_t stride_width, stride_height;
1472 int32_t numGroups;
1473 int32_t activation;
1474 bool data_layout = false;
1475
1476 if (inCount == 12) {
1477 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1478 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1479 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1480 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1481 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1482 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1483 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1484 activation = getScalarData<int32_t>(operands[ins[10]]);
1485 data_layout = getScalarData<bool>(operands[ins[11]]);
1486 } else {
1487 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1488 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1489 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1490 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1491 activation = getScalarData<int32_t>(operands[ins[7]]);
1492 data_layout = getScalarData<bool>(operands[ins[8]]);
1493 }
1494
1495 RunTimeOperandInfo& output = operands[outs[0]];
1496 Shape outShape = output.shape();
1497
1498 RunTimeOperandInfo input_tmp, output_tmp;
1499 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1500 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1501 success = false;
1502 break;
1503 }
1504 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1505 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1506 output_tmp.length = data_layout ? 0 : output.length;
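            // The CPU kernels below operate on NHWC data. When data_layout is true (NCHW input),
            // the input has been converted into a temporary NHWC buffer above, and output_tmp
            // starts without a buffer: it is allocated by setInfoAndAllocateIfNeeded below and
            // converted back to NCHW at the end. Otherwise output_tmp aliases the real output.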
1507
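            // The 9-input signature supplies only an implicit padding scheme (SAME/VALID), so the
            // explicit left/right/top/bottom padding is derived from the NHWC input size, filter
            // size, and stride. Rough example, assuming SAME padding and dilation 1: with
            // input_width = 10, stride_width = 2, filter_width = 3, the output width is
            // ceil(10 / 2) = 5, the total horizontal padding is (5 - 1) * 2 + 3 - 10 = 1,
            // and it is split as padding_left = 0, padding_right = 1.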
1508 if (inCount == 9) {
1509 Shape inputShape = input_tmp.shape();
1510 Shape filterShape = filter.shape();
1511 int32_t input_width = getSizeOfDimension(inputShape, 2);
1512 int32_t input_height = getSizeOfDimension(inputShape, 1);
1513 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1514 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1515 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1516 &padding_left, &padding_right);
1517 calculateExplicitPadding(input_height, stride_height, filter_height,
1518 padding_implicit, &padding_top, &padding_bottom);
1519 }
1520
1521 if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1522 padding_right, padding_top, padding_bottom, stride_width,
1523 stride_height, numGroups, &outShape) ||
1524 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1525 if (!data_layout) output.dimensions = output_tmp.dimensions;
1526 success = false;
1527 break;
1528 }
1529
1530 if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1531 success = groupedConvFloat32(
1532 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1533 reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1534 reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1535 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1536 numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1537 outShape);
1538 } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1539 success = groupedConvFloat16(
1540 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1541 reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1542 reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1543 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1544 numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1545 outShape);
1546 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1547 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1548 success = groupedConvQuant8PerChannel(
1549 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1550 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1551 std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1552 .scales.data(),
1553 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1554 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1555 stride_height, numGroups, activation,
1556 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1557 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1558 success = groupedConvQuant8(
1559 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1560 reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1561 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1562 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1563 stride_height, numGroups, activation,
1564 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1565 }
1566 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1567 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1568 success = groupedConvQuant8PerChannel(
1569 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1570 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1571 std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1572 .scales.data(),
1573 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1574 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1575 stride_height, numGroups, activation,
1576 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1577 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1578 success = groupedConvQuant8(
1579 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1580 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1581 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1582 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1583 stride_height, numGroups, activation,
1584 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1585 }
1586 }
1587
1588 if (data_layout) {
1589 output_tmp_guard.reset(output_tmp.buffer);
1590 }
1591 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1592 success = false;
1593 break;
1594 }
1595 } break;
1596 case OperationType::TILE: {
1597 if (!allParametersPresent(2, 1)) {
1598 return ANEURALNETWORKS_BAD_DATA;
1599 }
1600 const RunTimeOperandInfo& input = operands[ins[0]];
1601 const RunTimeOperandInfo& multiples = operands[ins[1]];
1602
1603 RunTimeOperandInfo& output = operands[outs[0]];
1604 Shape outShape = output.shape();
1605
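            // 'multiples' is a 1-D TENSOR_INT32 with one entry per input dimension; tile::prepare
            // scales each output dimension by the corresponding multiple, and tile::eval
            // replicates the input data to fill the enlarged output.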
1606 success =
1607 tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1608 multiples.shape(), &outShape) &&
1609 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1610 tile::eval(input.buffer, input.shape(),
1611 reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1612 outShape);
1613 } break;
1614 case OperationType::QUANTIZED_16BIT_LSTM: {
1615 if (!allParametersPresent(15, 2)) {
1616 return ANEURALNETWORKS_BAD_DATA;
1617 }
1618
1619 RunTimeOperandInfo& cellStateOut =
1620 operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1621 RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1622
1623 Shape cellStateOutShape, outputShape;
1624 QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1625
1626 success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1627 &outputShape) &&
1628 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1629 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1630 quantizedLSTMCell.eval();
1631 } break;
1632 case OperationType::POW: {
1633 if (!allParametersPresent(2, 1)) {
1634 return ANEURALNETWORKS_BAD_DATA;
1635 }
1636 const RunTimeOperandInfo& base = operands[ins[0]];
1637 const RunTimeOperandInfo& exponent = operands[ins[1]];
1638
1639 RunTimeOperandInfo& output = operands[outs[0]];
1640 Shape outShape = output.shape();
1641
1642 success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1643 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1644 pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1645 output.buffer, outShape);
1646 } break;
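        // Every operation type not handled by the switch above is dispatched through the
        // operation resolver registry. A usable registration must provide both prepare() and
        // execute(); its flags decide whether omitted or zero-sized input operands are
        // tolerated before prepare() runs.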
1647 default: {
1648 const OperationRegistration* operationRegistration =
1649 mOperationResolver->findOperation(operation.type);
1650 if (operationRegistration == nullptr) {
1651 LOG(ERROR) << operation.type << " not registered";
1652 } else if (operationRegistration->prepare == nullptr ||
1653 operationRegistration->execute == nullptr) {
1654 LOG(ERROR) << "Incomplete operation registration: " << operation.type;
1655 } else {
1656 OperationExecutionContext context(&operation, operands);
1657 success = operationRegistration->flags.allowOmittedOperand ||
1658 context.checkNoOmittedOperand();
1659 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1660 context.checkNoZeroSizedInput());
1661 success = success && operationRegistration->prepare(&context) &&
1662 operationRegistration->execute(&context);
1663 result = context.getResultCode();
1664 }
1665 }
1666 }
1667 if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1668 result = ANEURALNETWORKS_OP_FAILED;
1669 }
1670 if (result != ANEURALNETWORKS_NO_ERROR) {
1671 LOG(ERROR) << operation.type << " failed.";
1672 }
1673
1674 consumeOperationInputs(ins, operands);
1675 return result;
1676 #else
1677 LOG(ERROR) << "Built without CPU execution support";
1678 return ANEURALNETWORKS_OP_FAILED;
1679 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
1680 }
1681
1682 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1683 // to prevent deallocation of subgraph inputs and outputs.
1684 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1685 auto originalLifetime = to->lifetime;
1686 auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1687 *to = from;
1688 to->lifetime = originalLifetime;
1689 to->numberOfUsesLeft = originalNumberOfUsesLeft;
1690 }
1691
1692 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1693 namespace op = operation_if;
1694 const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1695 if (condOperand.buffer == nullptr) {
1696 LOG(ERROR) << "Cannot read IF condition operand value";
1697 return ANEURALNETWORKS_OP_FAILED;
1698 }
1699 const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1700 VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1701
1702 const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1703 const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1704 const Model::Subgraph& branchSubgraph =
1705 *reinterpret_cast<const Model::Subgraph*>(branchOperand.buffer);
1706 std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1707
1708 // Initialize inner input and output operands from outer operands.
1709 for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1710 setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1711 operands[operation.inputs[op::kFirstInput + i]]);
1712 }
1713 for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1714 setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1715 operands[operation.outputs[i]]);
1716 }
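    // The inner outputs initially alias the outer output buffers (when those are already
    // allocated), so the selected branch writes its results in place; the "Update outer
    // outputs" loop below then copies back the resulting operand info in case shapes or
    // buffers changed during execution.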
1717
1718 NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1719 freeUnusedSubgraphOperands(&branchOperands);
1720
1721 // Update outer outputs.
1722 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1723 setInfoExceptLifetime(&operands[operation.outputs[i]],
1724 branchOperands[branchSubgraph.outputIndexes[i]]);
1725 }
1726
1727 consumeOperationInputs(operation.inputs, operands);
1728 return ANEURALNETWORKS_NO_ERROR;
1729 }
1730
1731 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1732 namespace op = operation_while;
1733 const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1734 const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1735 const Model::Subgraph& condSubgraph =
1736 *reinterpret_cast<const Model::Subgraph*>(condModelOperand.buffer);
1737 const Model::Subgraph& bodySubgraph =
1738 *reinterpret_cast<const Model::Subgraph*>(bodyModelOperand.buffer);
1739 std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1740 std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1741
1742 // The code below implements the following sequence of subgraph input and output buffer
1743 // assignments:
1744 // iteration = 0 cond inputs = body inputs = outer inputs body outputs = tmp1
1745 // iteration = 1 cond inputs = body inputs = tmp1 body outputs = tmp2
1746 // iteration = 2 cond inputs = body inputs = tmp2 body outputs = tmp1
1747 // iteration = 3 cond inputs = body inputs = ... body outputs = ...
1748
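    // The body outputs alternate between tmp1 and tmp2 (selected by iteration % 2 below),
    // so each iteration can read the previous iteration's outputs through the condition
    // and body inputs while writing fresh outputs into the other buffer set.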
1749 // For body output double buffering.
1750 std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1751 std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1752
1753     // Ensure temporary buffers and unused subgraph operands are freed on every exit path.
1754 auto cleanupGuard = base::make_scope_guard(
1755 [&tmp1, &tmp2, &condOperands, &bodyOperands, &operation, &operands] {
1756 auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1757 for (auto buffer : tmp) {
1758 if (buffer != nullptr) {
1759 delete[] buffer;
1760 }
1761 }
1762 };
1763
1764 freeLoopOutputs(tmp1);
1765 freeLoopOutputs(tmp2);
1766 freeUnusedSubgraphOperands(&condOperands);
1767 freeUnusedSubgraphOperands(&bodyOperands);
1768 consumeOperationInputs(operation.inputs, operands);
1769 });
1770
1771 // For body outputs with unknown shape, we skip double buffering and
1772 // allocate on each iteration instead. This allows growing output tensors
1773 // inside a WHILE loop.
1774 std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1775 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1776 const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1777 bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1778 }
1779
1780 // Initialize condition inputs from outer operands.
1781 for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1782 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1783 operands[operation.inputs[op::kFirstInput + i]]);
1784 }
1785
1786 // Store condition output on the stack.
1787 RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1788 bool8 condValue = {/* initialized memory */};
1789 condOutput.buffer = &condValue;
1790 condOutput.length = sizeof(condValue);
1791
1792 std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1793 const auto startTime = Clock::now();
1794 for (uint32_t iteration = 0;; ++iteration) {
1795 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1796 if (iteration != 0) {
1797 // Set condition inputs from previous iteration outputs.
1798 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1799 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1800 bodyOperands[bodySubgraph.outputIndexes[i]]);
1801 }
1802 }
1803 NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1804 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1805 << static_cast<int>(condValue);
1806 if (!condValue) {
1807 break;
1808 }
1809
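        // Bound the loop by wall-clock time: once the configured loop timeout is exceeded,
        // give up with a transient missed-deadline error instead of iterating indefinitely.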
1810 const auto duration = Clock::now() - startTime;
1811 if (duration > timeoutDuration) {
1812 LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1813 << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1814 << " ms";
1815 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1816 }
1817
1818 // Set body inputs from condition inputs.
1819 for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1820 bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1821 }
1822 // Set body outputs.
1823 auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1824 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1825 RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1826 if (bodyOutputHasUnknownShape[i]) {
1827 // Reset dimensions and buffer.
1828 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1829 if (outputBuffer[i] != nullptr) {
1830 delete[] outputBuffer[i];
1831 outputBuffer[i] = nullptr;
1832 }
1833 }
1834 info.buffer = outputBuffer[i];
1835 }
1836
1837 NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1838
1839 // Update output buffer information in case we have allocated new buffers.
1840 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1841 outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1842 }
1843 }
1844
1845 // Copy body outputs to outer outputs.
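    // At this point the condition inputs alias the outputs of the final body iteration
    // (or the outer inputs if the body never ran), so they hold the values to return.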
1846 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1847 RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1848 RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1849 if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1850 return error;
1851 }
1852 CHECK_EQ(outerOperand.length, innerOperand.length);
1853 // TODO: Use the outer buffer as tmp1 to avoid copies.
1854 std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1855 }
1856
1857 return ANEURALNETWORKS_NO_ERROR;
1858 }
1859
1860 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1861 const std::vector<RunTimeOperandInfo>& operands) {
1862 mOutputShapes.resize(outputIndexes.size());
1863 for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1864 const uint32_t operandIndex = outputIndexes[i];
1865 const RunTimeOperandInfo& from = operands[operandIndex];
1866 mOutputShapes[i].dimensions = from.dimensions;
1867 mOutputShapes[i].isSufficient = from.isSufficient();
1868 VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
1869 << "] = " << mOutputShapes[i];
1870 }
1871 }
1872
1873 // b/109953668, disable OpenMP
1874 #ifdef NNAPI_OPENMP
1875 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1876 mBlocktimeInitial = kmp_get_blocktime();
1877 kmp_set_blocktime(20); // ms, see b/109645291
1878
1879 #if NNAPI_LIMIT_CPU_THREADS
1880     // Code not yet enabled. The thread count chosen below is to be tuned based on
1881     // benchmarking. See the longer comment by the class declaration.
1882 mMaxThreadsInitial = Eigen::nbThreads();
1883 const int nProcs = omp_get_num_procs();
1884 int threads = nProcs;
1885 if (nProcs >= 8) {
1886 threads = nProcs - 4;
1887 } else if (nProcs >= 4) {
1888 threads = nProcs - 2;
1889 }
1890 Eigen::setNbThreads(threads);
1891 #endif
1892 }
1893
1894 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1895 kmp_set_blocktime(mBlocktimeInitial);
1896 #if NNAPI_LIMIT_CPU_THREADS
1897 Eigen::setNbThreads(mMaxThreadsInitial);
1898 #endif
1899 }
1900 #endif // NNAPI_OPENMP
1901
1902 } // namespace nn
1903 } // namespace android
1904