1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "CpuExecutor"
18
19 #include "CpuExecutor.h"
20
21 #include <android-base/scopeguard.h>
22 #include <nnapi/SharedMemory.h>
23 #include <nnapi/TypeUtils.h>
24
25 #include <limits>
26 #include <memory>
27 #include <utility>
28 #include <vector>
29
30 #include "ControlFlow.h"
31 #include "NeuralNetworks.h"
32 #include "OperationResolver.h"
33 #include "Operations.h"
34 #include "OperationsExecutionUtils.h"
35 #include "Tracing.h"
36
37 // b/109953668, disable OpenMP
38 #ifdef NNAPI_OPENMP
39 #include <omp.h>
40
41 #include <Eigen/Core>
42 #endif // NNAPI_OPENMP
43
44 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
45 #include "BidirectionalSequenceLSTM.h"
46 #include "Cast.h"
47 #include "EmbeddingLookup.h"
48 #include "ExpandDims.h"
49 #include "HashtableLookup.h"
50 #include "LSHProjection.h"
51 #include "LSTM.h"
52 #include "MaximumMinimum.h"
53 #include "Multinomial.h"
54 #include "Pow.h"
55 #include "QuantizedLSTM.h"
56 #include "RNN.h"
57 #include "SVDF.h"
58 #include "Tile.h"
59 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
60
61 namespace android {
62 namespace nn {
63 namespace {
64
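// Adapts an Operation and its array of RunTimeOperandInfo to the IOperationExecutionContext
// interface used by operations implemented through the OperationResolver.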
65 class OperationExecutionContext : public IOperationExecutionContext {
66 DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
67
68 public:
69 OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
70 : operation(operation), operands(operands) {}
71
72 uint32_t getNumInputs() const override;
73 OperandType getInputType(uint32_t index) const override;
74 Shape getInputShape(uint32_t index) const override;
75 const void* getInputBuffer(uint32_t index) const override;
76 const Operand::ExtraParams& getInputExtraParams(uint32_t index) const override;
77
78 uint32_t getNumOutputs() const override;
79 OperandType getOutputType(uint32_t index) const override;
80 Shape getOutputShape(uint32_t index) const override;
81 void* getOutputBuffer(uint32_t index) override;
82
83 // Return false on failure and store the result code.
84 // Use getResultCode() to retrieve it at the end of the operation execution.
85 bool setOutputShape(uint32_t index, const Shape& shape) override;
86 int getResultCode() const;
87
88 bool isOmittedInput(uint32_t index) const override;
89 bool isOmittedOutput(uint32_t index) const override;
90
91 // Return false if any of the inputs or outputs is omitted, i.e. has a lifetime of NO_VALUE.
92 bool checkNoOmittedOperand() const;
93 // Return false if any of the inputs has a dimension of 0.
94 bool checkNoZeroSizedInput() const;
95
96 private:
97 const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
98 const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
99 RunTimeOperandInfo* getOutputInfo(uint32_t index);
100
101 const Operation* operation;
102 RunTimeOperandInfo* operands;
103
104 int result = ANEURALNETWORKS_NO_ERROR;
105 };
106
107 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
108 CHECK(index < operation->inputs.size());
109 return &operands[operation->inputs[index]];
110 }
111
112 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
113 CHECK(index < operation->outputs.size());
114 return &operands[operation->outputs[index]];
115 }
116
117 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
118 CHECK(index < operation->outputs.size());
119 return &operands[operation->outputs[index]];
120 }
121
122 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
123 return getInputInfo(index)->type;
124 }
125
126 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
127 return getInputInfo(index)->shape();
128 }
129
130 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
131 return getInputInfo(index)->buffer;
132 }
133
134 const Operand::ExtraParams& OperationExecutionContext::getInputExtraParams(uint32_t index) const {
135 return getInputInfo(index)->extraParams;
136 }
137
138 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
139 return getOutputInfo(index)->type;
140 }
141
142 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
143 return getOutputInfo(index)->shape();
144 }
145
146 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
147 return getOutputInfo(index)->buffer;
148 }
149
150 uint32_t OperationExecutionContext::getNumInputs() const {
151 return operation->inputs.size();
152 }
153
154 uint32_t OperationExecutionContext::getNumOutputs() const {
155 return operation->outputs.size();
156 }
157
158 int OperationExecutionContext::getResultCode() const {
159 return result;
160 }
161
162 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
163 // Updates the RunTimeOperandInfo with the newly calculated shape.
164 // Allocates the buffer if needed.
165 //
166 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
167 // propagate the extension type info into this function.
168 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
169 // For user-provided model output operands, the parameters must match the Shape
170 // calculated from the preparation step.
171 if (info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
172 if (info->type != shape.type) {
173 LOG(ERROR) << "Invalid type for model output";
174 *result = ANEURALNETWORKS_OP_FAILED;
175 return false;
176 }
177 if (info->scale != shape.scale) {
178 LOG(ERROR) << "Invalid scale for model output";
179 *result = ANEURALNETWORKS_OP_FAILED;
180 return false;
181 }
182 if (info->zeroPoint != shape.offset) {
183 LOG(ERROR) << "Invalid zeroPoint for model output";
184 *result = ANEURALNETWORKS_OP_FAILED;
185 return false;
186 }
187 if (info->extraParams != shape.extraParams) {
188 LOG(ERROR) << "Invalid extraParams for model output";
189 *result = ANEURALNETWORKS_OP_FAILED;
190 return false;
191 }
192 }
193
194 auto combined = combineDimensions(shape.dimensions, info->dimensions);
195 if (!combined.has_value()) {
196 LOG(ERROR) << "Invalid dimensions for model operand: " << combined.error();
197 *result = ANEURALNETWORKS_OP_FAILED;
198 return false;
199 }
200 info->dimensions = std::move(combined.value());
201 info->type = shape.type;
202 info->scale = shape.scale;
203 info->zeroPoint = shape.offset;
204 info->extraParams = shape.extraParams;
205
206 // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
207 // the sizes of extension types.
208 if (!isExtension(info->type) &&
209 nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
210 LOG(ERROR) << "Operand data size overflows uint32_t";
211 *result = ANEURALNETWORKS_OP_FAILED;
212 return false;
213 }
214
215 // Allocate the buffer only if the combined dimension is fully specified
216 if (info->buffer == nullptr && (info->lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
217 info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT)) {
218 if (isExtension(info->type)) {
219 LOG(ERROR) << "Cannot allocate a variable of an extension type";
220 *result = ANEURALNETWORKS_OP_FAILED;
221 return false;
222 }
223 uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
224 if (length > 0) {
225 info->buffer = new uint8_t[length];
226 if (info->buffer == nullptr) {
227 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
228 return false;
229 }
230 info->length = length;
231 }
232 }
233 if (!info->isSufficient()) {
234 uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
235 LOG(ERROR) << "Insufficient size for model operand: require = " << length
236 << ", provided = " << info->length;
237 *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
238 return false;
239 }
240 *result = ANEURALNETWORKS_NO_ERROR;
241 return true;
242 }
243
244 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
245 return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
246 }
247
248 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
249 return getInputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
250 }
251
252 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
253 return getOutputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
254 }
255
256 bool OperationExecutionContext::checkNoOmittedOperand() const {
257 for (uint32_t i = 0; i < operation->inputs.size(); i++) {
258 NN_RET_CHECK(!isOmittedInput(i))
259 << operation->type << " input operand " << i << " is required but missing.";
260 }
261 for (uint32_t i = 0; i < operation->outputs.size(); i++) {
262 NN_RET_CHECK(!isOmittedOutput(i))
263 << operation->type << " output operand " << i << " is required but missing.";
264 }
265 return true;
266 }
267
268 bool OperationExecutionContext::checkNoZeroSizedInput() const {
269 for (uint32_t i = 0; i < operation->inputs.size(); i++) {
270 if (isOmittedInput(i)) continue;
271 for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
272 NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0u)
273 << operation->type << " does not support zero-sized tensor, but input " << i
274 << " dimension " << j << " is 0.";
275 }
276 }
277 return true;
278 }
279
280 } // namespace
281
282 // Used to keep a pointer to a memory pool.
283 //
284 // In the case of an "mmap_fd" pool, owns the mmap region
285 // returned by getBuffer() -- i.e., that region goes away
286 // when the RunTimePoolInfo is destroyed or is assigned to.
287 class RunTimePoolInfo::RunTimePoolInfoImpl {
288 public:
289 RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping);
290
291 uint8_t* getBuffer() const;
292 uint32_t getSize() const;
293
294 bool flush() const;
295
296 const SharedMemory& getMemory() const { return mMemory; }
297
298 private:
299 const SharedMemory mMemory;
300 const Mapping mMapping;
301 };
302
303 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping)
304 : mMemory(std::move(memory)), mMapping(std::move(mapping)) {}
305
306 uint8_t* RunTimePoolInfo::RunTimePoolInfoImpl::getBuffer() const {
307 return std::visit(
308 [](auto* pointer) {
309 // Writing to a const buffer may lead to undefined behavior.
310 // TODO: Refactor the code to avoid the const_cast.
311 return static_cast<uint8_t*>(const_cast<void*>(pointer));
312 },
313 mMapping.pointer);
314 }
315
316 uint32_t RunTimePoolInfo::RunTimePoolInfoImpl::getSize() const {
317 CHECK_LE(mMapping.size, std::numeric_limits<uint32_t>::max());
318 return static_cast<uint32_t>(mMapping.size);
319 }
320
321 // Makes sure the output data are correctly updated after execution.
322 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
323 return nn::flush(mMapping);
324 }
325
326 // TODO: short term, make shared memory mapping and updating a utility function.
327 // TODO: long term, implement mmap_fd as a hidl IMemory service.
328 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromMemory(const SharedMemory& memory) {
329 auto mapping = map(memory);
330 if (!mapping.has_value()) {
331 LOG(ERROR) << "Can't map shared memory: " << mapping.error().message;
332 return std::nullopt;
333 }
334 const auto impl =
335 std::make_shared<const RunTimePoolInfoImpl>(memory, std::move(mapping).value());
336 return RunTimePoolInfo(impl);
337 }
338
339 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
340 auto mapping = Mapping{.pointer = buffer, .size = size};
341 const auto impl = std::make_shared<const RunTimePoolInfoImpl>(std::make_shared<const Memory>(),
342 std::move(mapping));
343 return RunTimePoolInfo(impl);
344 }
345
346 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
347 : mImpl(impl) {}
348
349 uint8_t* RunTimePoolInfo::getBuffer() const {
350 return mImpl->getBuffer();
351 }
352
353 uint32_t RunTimePoolInfo::getSize() const {
354 return mImpl->getSize();
355 }
356
357 bool RunTimePoolInfo::flush() const {
358 return mImpl->flush();
359 }
360
361 const SharedMemory& RunTimePoolInfo::getMemory() const {
362 return mImpl->getMemory();
363 }
364
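// Maps each canonical SharedMemory into a RunTimePoolInfo. On the first pool that fails to
// map, the output vector is cleared and false is returned.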
365 bool setRunTimePoolInfosFromCanonicalMemories(std::vector<RunTimePoolInfo>* poolInfos,
366 const std::vector<SharedMemory>& pools) {
367 CHECK(poolInfos != nullptr);
368 poolInfos->clear();
369 poolInfos->reserve(pools.size());
370 for (const auto& pool : pools) {
371 if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromMemory(pool)) {
372 poolInfos->push_back(*poolInfo);
373 } else {
374 LOG(ERROR) << "Could not map pools";
375 poolInfos->clear();
376 return false;
377 }
378 }
379 return true;
380 }
381
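// Same as above, but accepts Request::MemoryPool entries; pools that are not plain
// SharedMemory (e.g. driver-managed buffer tokens) cannot be mapped here and are rejected.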
382 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
383 const std::vector<Request::MemoryPool>& pools) {
384 CHECK(poolInfos != nullptr);
385 poolInfos->clear();
386 poolInfos->reserve(pools.size());
387 for (const auto& pool : pools) {
388 if (!std::holds_alternative<SharedMemory>(pool)) {
389 LOG(ERROR) << "Unknown memory token";
390 poolInfos->clear();
391 return false;
392 }
393 if (std::optional<RunTimePoolInfo> poolInfo =
394 RunTimePoolInfo::createFromMemory(std::get<SharedMemory>(pool))) {
395 poolInfos->push_back(*poolInfo);
396 } else {
397 LOG(ERROR) << "Could not map pools";
398 poolInfos->clear();
399 return false;
400 }
401 }
402 return true;
403 }
404
405 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
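// Copies a 4-D tensor stored in NCHW order (fromDim = {N, C, H, W}) into NHWC order.
// The destination buffer must already be large enough to hold the permuted data.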
406 template <typename T>
407 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
408 uint32_t spatialSize = fromDim[2] * fromDim[3];
409 for (uint32_t n = 0; n < fromDim[0]; n++) {
410 for (uint32_t hw = 0; hw < spatialSize; hw++) {
411 for (uint32_t c = 0; c < fromDim[1]; c++) {
412 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
413 *to++ = from[fromIndex];
414 }
415 }
416 }
417 return true;
418 }
419
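// Copies a 4-D tensor stored in NHWC order (fromDim = {N, H, W, C}) back into NCHW order.
// The destination buffer must already be large enough to hold the permuted data.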
420 template <typename T>
421 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
422 uint32_t spatialSize = fromDim[1] * fromDim[2];
423 for (uint32_t n = 0; n < fromDim[0]; n++) {
424 for (uint32_t c = 0; c < fromDim[3]; c++) {
425 for (uint32_t hw = 0; hw < spatialSize; hw++) {
426 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
427 *to++ = from[fromIndex];
428 }
429 }
430 }
431 return true;
432 }
433
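// Prepares an NHWC view of 'from' for the NHWC-only kernels below. When data_layout is true
// the input is NCHW, so a temporary NHWC copy is allocated (owned by ptr_guard); otherwise
// 'to' simply aliases 'from'.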
434 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
435 std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
436 int result;
437 if (from.dimensions.size() != 4) {
438 LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
439 return false;
440 }
441 to.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
442 if (data_layout) {
443 // convert dimensions
444 Shape inShape = from.shape();
445 auto& fromDim = from.dimensions;
446 inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
447 // allocate buffer
448 to.buffer = nullptr;
449 if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
450 return false;
451 }
452 ptr_guard.reset(to.buffer);
453 // convert value
454 if (from.type == OperandType::TENSOR_FLOAT32) {
455 return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
456 reinterpret_cast<const float*>(from.buffer), fromDim);
457 } else if (from.type == OperandType::TENSOR_FLOAT16) {
458 return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
459 reinterpret_cast<const _Float16*>(from.buffer),
460 fromDim);
461 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
462 return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
463 reinterpret_cast<const uint8_t*>(from.buffer),
464 fromDim);
465 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
466 return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
467 reinterpret_cast<const int8_t*>(from.buffer), fromDim);
468 } else {
469 LOG(ERROR) << "Unsupported data type";
470 return false;
471 }
472 } else {
473 to = from;
474 }
475 return true;
476 }
477
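// Counterpart of convertToNhwc: when data_layout is true, converts the NHWC result in 'from'
// back into the NCHW output operand 'to' (allocating it if needed); otherwise it only
// propagates the buffer and shape information.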
478 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
479 bool data_layout, int* result) {
480 if (from.dimensions.size() != 4) {
481 LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
482 return false;
483 }
484 if (data_layout) {
485 // convert dimensions
486 Shape outShape = from.shape();
487 auto& fromDim = from.dimensions;
488 outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
489 // allocate buffer
490 if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
491 return false;
492 }
493 // convert value
494 if (from.type == OperandType::TENSOR_FLOAT32) {
495 return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
496 reinterpret_cast<const float*>(from.buffer), fromDim);
497 } else if (from.type == OperandType::TENSOR_FLOAT16) {
498 return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
499 reinterpret_cast<const _Float16*>(from.buffer),
500 fromDim);
501 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
502 return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
503 reinterpret_cast<const uint8_t*>(from.buffer),
504 fromDim);
505 } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
506 return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
507 reinterpret_cast<const int8_t*>(from.buffer),
508 fromDim);
509 } else {
510 LOG(ERROR) << "Unsupported data type";
511 return false;
512 }
513 } else {
514 Shape outShape = from.shape();
515 to.buffer = from.buffer;
516 to.length = from.length;
517 if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
518 return false;
519 }
520 }
521 return true;
522 }
523 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
524
525 // Decrements the usage count for the operands listed. Frees the memory
526 // allocated for any temporary variable with a count of zero.
527 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
528 RunTimeOperandInfo* operands) {
529 for (uint32_t i : inputs) {
530 auto& info = operands[i];
531 // A count of 0 means the operand is static data or a model input/output, not a temporary.
532 if (info.numberOfUsesLeft == 0) {
533 continue;
534 }
535 info.numberOfUsesLeft--;
536 if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
537 delete[] info.buffer;
538 info.buffer = nullptr;
539 }
540 }
541 }
542
543 // This function only frees TEMPORARY_VARIABLE operands that are unused
544 // outputs because consumeOperationInputs takes care of any operands
545 // that are inputs to an operation.
546 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
547 for (auto& info : *operands) {
548 if (info.lifetime == Operand::LifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
549 info.buffer != nullptr) {
550 delete[] info.buffer;
551 info.buffer = nullptr;
552 }
553 }
554 }
555
556 // Ignore the .pools entry in model and request. This will have been taken care of
557 // by the caller.
558 int CpuExecutor::run(const Model& model, const Request& request,
559 const std::vector<RunTimePoolInfo>& modelPoolInfos,
560 const std::vector<RunTimePoolInfo>& requestPoolInfos) {
561 NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
562 VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(request) << ")";
563 mModelOperandValues = model.operandValues.data();
564 mModelPoolInfos = &modelPoolInfos;
565 mReferencedSubgraphs = &model.referenced;
566
567 // b/109953668, disable OpenMP
568 #ifdef NNAPI_OPENMP
569 ScopedOpenmpSettings openMpSettings;
570 #endif // NNAPI_OPENMP
571
572 std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
573 updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
574 updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
575 operands.data());
576 int result = executeSubgraph(model.main, operands.data());
577 freeUnusedSubgraphOperands(&operands);
578
579 if (result == ANEURALNETWORKS_NO_ERROR) {
580 VLOG(CPUEXE) << "Completed run normally";
581 for (auto& runtimeInfo : requestPoolInfos) {
582 runtimeInfo.flush();
583 }
584 }
585
586 // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
587 if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
588 setOutputShapes(model.main.outputIndexes, operands);
589 } else {
590 mOutputShapes.clear();
591 }
592
593 mFinished = true;
594 mModelOperandValues = nullptr;
595 mModelPoolInfos = nullptr;
596 mReferencedSubgraphs = nullptr;
597 return result;
598 }
599
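// Runs every operation of the subgraph in order, stopping at the first one that fails.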
600 int CpuExecutor::executeSubgraph(const Model::Subgraph& subgraph, RunTimeOperandInfo* operands) {
601 VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << subgraph;
602 // The graph has serialized the operations in execution order.
603 for (const auto& operation : subgraph.operations) {
604 NN_RETURN_IF_ERROR(executeOperation(operation, operands));
605 }
606 return ANEURALNETWORKS_NO_ERROR;
607 }
608
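// Builds the RunTimeOperandInfo vector for a subgraph: constant operands are pointed at their
// backing storage, temporaries start with a use count equal to their number of consumers, and
// subgraph inputs/outputs are left unbound until updateForArguments() is called.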
609 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(
610 const Model::Subgraph& subgraph) {
611 VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
612 const size_t count = subgraph.operands.size();
613 std::vector<RunTimeOperandInfo> operands(count);
614 std::vector<uint32_t> numberOfConsumers =
615 countNumberOfConsumers(count, subgraph.operations).value();
616 for (size_t i = 0; i < count; i++) {
617 const Operand& from = subgraph.operands[i];
618 RunTimeOperandInfo& to = operands[i];
619 to.type = from.type;
620 to.dimensions = from.dimensions;
621 to.scale = from.scale;
622 to.zeroPoint = from.zeroPoint;
623 to.length = from.location.length;
624 to.lifetime = from.lifetime;
625 to.extraParams = from.extraParams;
626 switch (from.lifetime) {
627 case Operand::LifeTime::TEMPORARY_VARIABLE:
628 to.buffer = nullptr;
629 to.numberOfUsesLeft = numberOfConsumers[i];
630 break;
631 case Operand::LifeTime::CONSTANT_COPY:
632 to.buffer = const_cast<uint8_t*>(mModelOperandValues + from.location.offset);
633 to.numberOfUsesLeft = 0;
634 break;
635 case Operand::LifeTime::CONSTANT_REFERENCE: {
636 auto poolIndex = from.location.poolIndex;
637 CHECK_LT(poolIndex, mModelPoolInfos->size());
638 auto& r = (*mModelPoolInfos)[poolIndex];
639 to.buffer = r.getBuffer() + from.location.offset;
640 to.numberOfUsesLeft = 0;
641 break;
642 }
643 case Operand::LifeTime::SUBGRAPH: {
644 auto subgraphIndex = from.location.offset;
645 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
646 to.buffer = reinterpret_cast<uint8_t*>(
647 const_cast<Model::Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
648 to.numberOfUsesLeft = 0;
649 } break;
650 case Operand::LifeTime::POINTER: {
651 to.buffer = reinterpret_cast<uint8_t*>(
652 const_cast<void*>(std::get<const void*>(from.location.pointer)));
653 to.numberOfUsesLeft = 0;
654 } break;
655 case Operand::LifeTime::SUBGRAPH_INPUT:
656 case Operand::LifeTime::SUBGRAPH_OUTPUT:
657 case Operand::LifeTime::NO_VALUE:
658 to.buffer = nullptr;
659 to.numberOfUsesLeft = 0;
660 break;
661 }
662 }
663 return operands;
664 }
665
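// Binds the listed operands (model inputs or outputs) to the buffers described by the request
// arguments, either inside a request pool or behind a caller-provided pointer, and applies any
// dimensions the request specifies for them.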
666 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
667 const std::vector<Request::Argument>& arguments,
668 const std::vector<RunTimePoolInfo>& requestPoolInfos,
669 RunTimeOperandInfo* operands) {
670 CHECK_EQ(indexes.size(), arguments.size());
671 for (size_t i = 0; i < indexes.size(); i++) {
672 const uint32_t operandIndex = indexes[i];
673 const Request::Argument& from = arguments[i];
674 RunTimeOperandInfo& to = operands[operandIndex];
675 if (!from.dimensions.empty()) {
676 // It's the responsibility of the caller to validate that
677 // from.dimensions only modifies the dimensions that were
678 // unspecified in the model. That's the case in SampleDriver.cpp
679 // with the call to validateRequest().
680 // TODO make sure that's the case for the default CPU path.
681 to.dimensions = from.dimensions;
682 }
683 switch (from.lifetime) {
684 case Request::Argument::LifeTime::NO_VALUE: {
685 to.lifetime = Operand::LifeTime::NO_VALUE;
686 CHECK(to.buffer == nullptr);
687 to.length = 0;
688 break;
689 }
690 case Request::Argument::LifeTime::POOL: {
691 auto poolIndex = from.location.poolIndex;
692 CHECK_LT(poolIndex, requestPoolInfos.size());
693 auto& r = requestPoolInfos[poolIndex];
694 to.buffer = r.getBuffer() + from.location.offset;
695 if (from.location.offset == 0 && from.location.length == 0) {
696 // Use the entire memory region.
697 to.length = r.getSize();
698 } else {
699 to.length = from.location.length;
700 }
701 break;
702 }
703 case Request::Argument::LifeTime::POINTER: {
704 constexpr auto fn = [](const void* ptr) {
705 return static_cast<const uint8_t*>(ptr);
706 };
707 auto ptr = std::visit(fn, from.location.pointer);
708 // Writing to a const buffer may lead to undefined behavior.
709 // TODO: Refactor the code to avoid the const_cast.
710 to.buffer = const_cast<uint8_t*>(ptr);
711 to.length = from.location.length;
712 break;
713 }
714 }
715 }
716 }
717
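// Executes a single operation. IF and WHILE are dispatched to dedicated control flow handlers;
// the operation types handled directly by this executor are dispatched through the switch
// statement below.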
718 int CpuExecutor::executeOperation([[maybe_unused]] const Operation& operation,
719 [[maybe_unused]] RunTimeOperandInfo* operands) {
720 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
721 if (hasDeadlinePassed(mDeadline)) {
722 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
723 }
724 if (operation.type == OperationType::IF) {
725 int result = executeIfOperation(operation, operands);
726 if (result != ANEURALNETWORKS_NO_ERROR) {
727 LOG(ERROR) << "IF failed.";
728 }
729 return result;
730 }
731 if (operation.type == OperationType::WHILE) {
732 int result = executeWhileOperation(operation, operands);
733 if (result != ANEURALNETWORKS_NO_ERROR) {
734 LOG(ERROR) << "WHILE failed.";
735 }
736 return result;
737 }
738
739 // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << operation << ")";
740 const std::vector<uint32_t>& ins = operation.inputs;
741 const std::vector<uint32_t>& outs = operation.outputs;
742 bool success = false;
743 int result = ANEURALNETWORKS_NO_ERROR;
744
745 // Function to verify that the number of input and output parameters
746 // matches what is expected. Also checks that all the parameters have
747 // values. This function is to be used only for operations that do not
748 // accept optional arguments.
749 // TODO Have a version that works for optional arguments.
750 auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
751 size_t requiredOuts) -> bool {
752 auto verify = [&operation, &operands](size_t requiredCount,
753 const std::vector<uint32_t>& indexes,
754 const char* type) -> bool {
755 size_t actualCount = indexes.size();
756 if (actualCount != requiredCount) {
757 LOG(ERROR) << operation.type << ": Invalid number of " << type << " operands. Got "
758 << actualCount << " of " << requiredCount;
759 return false;
760 }
761 for (size_t i = 0; i < actualCount; i++) {
762 if (operands[indexes[i]].lifetime == Operand::LifeTime::NO_VALUE) {
763 LOG(ERROR) << operation.type << " " << type << " operand " << i
764 << " is required but missing.";
765 return false;
766 }
767 }
768 return true;
769 };
770
771 auto verifyNoZeroSizedInputs = [&operation,
772 &operands](const std::vector<uint32_t>& indexes) {
773 for (size_t i = 0; i < indexes.size(); i++) {
774 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
775 if (operands[indexes[i]].dimensions[j] == 0) {
776 LOG(ERROR) << operation.type
777 << " does not support zero-sized tensor, but input " << i
778 << " dimension " << j << " is zero.";
779 return false;
780 }
781 }
782 }
783 return true;
784 };
785
786 return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
787 verifyNoZeroSizedInputs(ins);
788 };
789
790 switch (operation.type) {
791 case OperationType::OEM_OPERATION: {
792 LOG(ERROR) << "OEM operation not supported for CPU execution";
793 success = false;
794 } break;
795 case OperationType::RESHAPE: {
796 if (!allParametersPresent(2, 1)) {
797 return ANEURALNETWORKS_BAD_DATA;
798 }
799 const RunTimeOperandInfo& input = operands[ins[0]];
800 const RunTimeOperandInfo& targetShape = operands[ins[1]];
801
802 RunTimeOperandInfo& output = operands[outs[0]];
803 Shape outShape = output.shape();
804
805 success = reshapePrepare(input.shape(),
806 reinterpret_cast<const int32_t*>(targetShape.buffer),
807 getNumberOfElements(targetShape.shape()), &outShape) &&
808 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
809 copyData(input.buffer, input.shape(), output.buffer, outShape);
810 } break;
811 case OperationType::DEPTH_TO_SPACE: {
812 const size_t inCount = ins.size();
813 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
814 return ANEURALNETWORKS_BAD_DATA;
815 }
816 const RunTimeOperandInfo& input = operands[ins[0]];
817 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
818 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
819
820 RunTimeOperandInfo& output = operands[outs[0]];
821 Shape outShape = output.shape();
822
823 RunTimeOperandInfo input_tmp, output_tmp;
824 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
825 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
826 success = false;
827 break;
828 }
829 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
830 output_tmp.buffer = data_layout ? nullptr : output.buffer;
831 output_tmp.length = data_layout ? 0 : output.length;
832 if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
833 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
834 if (!data_layout) output.dimensions = output_tmp.dimensions;
835 break;
836 }
837 switch (input_tmp.type) {
838 case OperandType::TENSOR_FLOAT32: {
839 success = depthToSpaceGeneric(
840 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
841 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
842 break;
843 }
844 case OperandType::TENSOR_FLOAT16: {
845 success = depthToSpaceGeneric(
846 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
847 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
848 break;
849 }
850 case OperandType::TENSOR_QUANT8_ASYMM: {
851 success = depthToSpaceGeneric(
852 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
853 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
854 break;
855 }
856 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
857 success = depthToSpaceGeneric(
858 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
859 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
860 break;
861 }
862 default: {
863 LOG(ERROR) << "Unsupported data type";
864 success = false;
865 }
866 }
867 if (data_layout) {
868 output_tmp_guard.reset(output_tmp.buffer);
869 }
870 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
871 success = false;
872 break;
873 }
874 } break;
875 case OperationType::SPACE_TO_DEPTH: {
876 const size_t inCount = ins.size();
877 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
878 return ANEURALNETWORKS_BAD_DATA;
879 }
880 const RunTimeOperandInfo& input = operands[ins[0]];
881 int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
882 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
883
884 RunTimeOperandInfo& output = operands[outs[0]];
885 Shape outShape = output.shape();
886
887 RunTimeOperandInfo input_tmp, output_tmp;
888 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
889 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
890 success = false;
891 break;
892 }
893 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
894 output_tmp.buffer = data_layout ? nullptr : output.buffer;
895 output_tmp.length = data_layout ? 0 : output.length;
896
897 if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
898 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
899 if (!data_layout) output.dimensions = output_tmp.dimensions;
900 break;
901 }
902 switch (input_tmp.type) {
903 case OperandType::TENSOR_FLOAT32: {
904 success = spaceToDepthGeneric(
905 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
906 blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
907 break;
908 }
909 case OperandType::TENSOR_FLOAT16: {
910 success = spaceToDepthGeneric(
911 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
912 blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
913 break;
914 }
915 case OperandType::TENSOR_QUANT8_ASYMM: {
916 success = spaceToDepthGeneric(
917 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
918 blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
919 break;
920 }
921 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
922 success = spaceToDepthGeneric(
923 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
924 blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
925 break;
926 }
927 default: {
928 LOG(ERROR) << "Unsupported data type";
929 success = false;
930 }
931 }
932 if (data_layout) {
933 output_tmp_guard.reset(output_tmp.buffer);
934 }
935 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
936 success = false;
937 break;
938 }
939 } break;
940 case OperationType::EMBEDDING_LOOKUP: {
941 if (!allParametersPresent(2, 1)) {
942 return ANEURALNETWORKS_BAD_DATA;
943 }
944 const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
945 const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
946 RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
947
948 Shape outputShape;
949 EmbeddingLookup lookup(operation, operands);
950
951 success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
952 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
953 } break;
954 case OperationType::HASHTABLE_LOOKUP: {
955 if (!allParametersPresent(3, 2)) {
956 return ANEURALNETWORKS_BAD_DATA;
957 }
958 const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
959 const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
960 const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
961
962 RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
963 RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
964
965 Shape outputShape, hitShape;
966 HashtableLookup lookup(operation, operands);
967
968 success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
969 &outputShape, &hitShape) &&
970 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
971 setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
972 } break;
973 case OperationType::LSH_PROJECTION: {
974 RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
975 Shape outputShape;
976 if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
977 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
978 break;
979 }
980
981 LSHProjection lsh(operation, operands);
982 const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
983 switch (hash.type) {
984 case OperandType::TENSOR_FLOAT32: {
985 success = lsh.Eval<float>();
986 break;
987 }
988 case OperandType::TENSOR_FLOAT16: {
989 success = lsh.Eval<_Float16>();
990 break;
991 }
992 default: {
993 success = false;
994 LOG(ERROR) << "Unsupported data type";
995 }
996 }
997 } break;
998 case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
999 const auto merge_outputs = getScalarData<bool>(
1000 operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1001 const bool output_state = (outs.size() == 5 || outs.size() == 6);
1002 RunTimeOperandInfo& fwOutput =
1003 operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1004 Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1005 fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1006
1007 BidirectionalSequenceLSTM lstm(operation, operands);
1008 success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1009 &fwOutputActivationStateShape, &fwOutputCellStateShape,
1010 &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1011 setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1012 if (!merge_outputs) {
1013 RunTimeOperandInfo& bwOutput =
1014 operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1015 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1016 }
1017 if (output_state) {
1018 uint32_t delta = merge_outputs ? 1 : 0;
1019 RunTimeOperandInfo& fwOutputActivationState =
1020 operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1021 delta]];
1022 RunTimeOperandInfo& fwOutputCellState =
1023 operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1024 RunTimeOperandInfo& bwOutputActivationState =
1025 operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1026 delta]];
1027 RunTimeOperandInfo& bwOutputCellState =
1028 operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1029 success = success &&
1030 setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1031 fwOutputActivationStateShape, &result) &&
1032 setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1033 &result) &&
1034 setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1035 bwOutputActivationStateShape, &result) &&
1036 setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1037 &result);
1038 }
1039 success = success && lstm.Eval();
1040 } break;
1041 case OperationType::LSTM: {
1042 RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1043 RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1044 RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1045 RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1046
1047 Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1048 LSTMCell lstm_cell(operation, operands);
1049
1050 success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1051 &cellStateShape, &outputShape) &&
1052 setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1053 setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1054 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1055 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1056 } break;
1057 case OperationType::RANDOM_MULTINOMIAL: {
1058 if (!allParametersPresent(3, 1)) {
1059 return ANEURALNETWORKS_BAD_DATA;
1060 }
1061 RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1062
1063 Shape outputShape;
1064 Multinomial multinomial(operation, operands);
1065
1066 success = Multinomial::Prepare(operation, operands, &outputShape) &&
1067 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1068 multinomial.Eval();
1069 } break;
1070 case OperationType::RNN: {
1071 if (!allParametersPresent(6, 2)) {
1072 return ANEURALNETWORKS_BAD_DATA;
1073 }
1074
1075 RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1076 RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1077
1078 Shape hiddenStateShape, outputShape;
1079 RNN rnn_cell(operation, operands);
1080
1081 success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1082 setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1083 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1084 } break;
1085 case OperationType::SVDF: {
1086 RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1087 RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1088
1089 Shape stateShape, outputShape;
1090 SVDF svdf(operation, operands);
1091
1092 success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1093 setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1094 setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1095 } break;
1096 case OperationType::BATCH_TO_SPACE_ND: {
1097 const size_t inCount = ins.size();
1098 if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1099 return ANEURALNETWORKS_BAD_DATA;
1100 }
1101 const RunTimeOperandInfo& input = operands[ins[0]];
1102 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1103 bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1104
1105 RunTimeOperandInfo& output = operands[outs[0]];
1106 Shape outShape = output.shape();
1107
1108 RunTimeOperandInfo input_tmp, output_tmp;
1109 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1110 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1111 success = false;
1112 break;
1113 }
1114 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1115 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1116 output_tmp.length = data_layout ? 0 : output.length;
1117
1118 if (!batchToSpacePrepare(input_tmp.shape(),
1119 reinterpret_cast<const int32_t*>(blockSize.buffer),
1120 blockSize.shape(), &outShape) ||
1121 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1122 if (!data_layout) output.dimensions = output_tmp.dimensions;
1123 break;
1124 }
1125 switch (input_tmp.type) {
1126 case OperandType::TENSOR_FLOAT32: {
1127 success = batchToSpaceGeneric(
1128 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1129 reinterpret_cast<const int32_t*>(blockSize.buffer),
1130 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1131 break;
1132 }
1133 case OperandType::TENSOR_FLOAT16: {
1134 success = batchToSpaceGeneric(
1135 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1136 reinterpret_cast<const int32_t*>(blockSize.buffer),
1137 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1138 break;
1139 }
1140 case OperandType::TENSOR_QUANT8_ASYMM: {
1141 success = batchToSpaceGeneric(
1142 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1143 reinterpret_cast<const int32_t*>(blockSize.buffer),
1144 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1145 break;
1146 }
1147 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1148 success = batchToSpaceGeneric(
1149 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1150 reinterpret_cast<const int32_t*>(blockSize.buffer),
1151 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1152 break;
1153 }
1154 default: {
1155 LOG(ERROR) << "Unsupported data type";
1156 success = false;
1157 }
1158 }
1159 if (data_layout) {
1160 output_tmp_guard.reset(output_tmp.buffer);
1161 }
1162 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1163 success = false;
1164 break;
1165 }
1166 } break;
1167 case OperationType::SPACE_TO_BATCH_ND: {
1168 const size_t inCount = ins.size();
1169 if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1170 return ANEURALNETWORKS_BAD_DATA;
1171 }
1172 const RunTimeOperandInfo& input = operands[ins[0]];
1173 const RunTimeOperandInfo& blockSize = operands[ins[1]];
1174 const RunTimeOperandInfo& paddings = operands[ins[2]];
1175 bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1176
1177 RunTimeOperandInfo& output = operands[outs[0]];
1178 Shape outShape = output.shape();
1179
1180 RunTimeOperandInfo input_tmp, output_tmp;
1181 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1182 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1183 success = false;
1184 break;
1185 }
1186 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1187 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1188 output_tmp.length = data_layout ? 0 : output.length;
1189
1190 if (!spaceToBatchPrepare(
1191 input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1192 blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1193 paddings.shape(), &outShape) ||
1194 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1195 if (!data_layout) output.dimensions = output_tmp.dimensions;
1196 break;
1197 }
1198 switch (input_tmp.type) {
1199 case OperandType::TENSOR_FLOAT32: {
1200 success = spaceToBatchGeneric(
1201 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1202 reinterpret_cast<const int32_t*>(blockSize.buffer),
1203 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1204 reinterpret_cast<float*>(output_tmp.buffer), outShape);
1205 break;
1206 }
1207 case OperandType::TENSOR_FLOAT16: {
1208 success = spaceToBatchGeneric(
1209 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1210 reinterpret_cast<const int32_t*>(blockSize.buffer),
1211 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1212 reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1213 break;
1214 }
1215 case OperandType::TENSOR_QUANT8_ASYMM: {
1216 success = spaceToBatchGeneric(
1217 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1218 reinterpret_cast<const int32_t*>(blockSize.buffer),
1219 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1220 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1221 break;
1222 }
1223 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1224 success = spaceToBatchGeneric(
1225 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1226 reinterpret_cast<const int32_t*>(blockSize.buffer),
1227 reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1228 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1229 break;
1230 }
1231 default: {
1232 LOG(ERROR) << "Unsupported data type";
1233 success = false;
1234 }
1235 }
1236 if (data_layout) {
1237 output_tmp_guard.reset(output_tmp.buffer);
1238 }
1239 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1240 success = false;
1241 break;
1242 }
1243 } break;
1244 case OperationType::PAD:
1245 case OperationType::PAD_V2: {
1246 const bool isV2 = operation.type == OperationType::PAD_V2;
1247 if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1248 return ANEURALNETWORKS_BAD_DATA;
1249 }
1250 const RunTimeOperandInfo& input = operands[ins[0]];
1251 const RunTimeOperandInfo& paddings = operands[ins[1]];
1252
1253 RunTimeOperandInfo& output = operands[outs[0]];
1254 Shape outShape = output.shape();
1255
1256 if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1257 paddings.shape(), &outShape) ||
1258 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1259 break;
1260 }
1261 if (input.type == OperandType::TENSOR_FLOAT32) {
1262 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1263 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1264 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1265 reinterpret_cast<float*>(output.buffer), outShape);
1266 } else if (input.type == OperandType::TENSOR_FLOAT16) {
1267 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1268 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1269 reinterpret_cast<const int32_t*>(paddings.buffer),
1270 static_cast<_Float16>(pad_value),
1271 reinterpret_cast<_Float16*>(output.buffer), outShape);
1272 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1273 uint8_t pad_value =
1274 isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1275 success = padGeneric(input.buffer, input.shape(),
1276 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1277 output.buffer, outShape);
1278 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1279 uint8_t pad_value =
1280 isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1281 success = padGeneric(input.buffer, input.shape(),
1282 reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1283 output.buffer, outShape);
1284 }
1285 } break;
1286 case OperationType::CAST: {
1287 if (!allParametersPresent(1, 1)) {
1288 return ANEURALNETWORKS_BAD_DATA;
1289 }
1290 const RunTimeOperandInfo& input = operands[ins[0]];
1291
1292 RunTimeOperandInfo& output = operands[outs[0]];
1293 Shape outShape = output.shape();
1294
1295 success = cast::prepare(input.shape(), &outShape) &&
1296 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1297 cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1298 } break;
1299 case OperationType::MEAN: {
1300 if (!allParametersPresent(3, 1)) {
1301 return ANEURALNETWORKS_BAD_DATA;
1302 }
1303 const RunTimeOperandInfo& input = operands[ins[0]];
1304 const RunTimeOperandInfo& axis = operands[ins[1]];
1305 int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1306
1307 RunTimeOperandInfo& output = operands[outs[0]];
1308 Shape outShape = output.shape();
1309
1310 if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1311 axis.shape(), keepDims > 0, &outShape) ||
1312 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1313 break;
1314 }
1315 if (input.type == OperandType::TENSOR_FLOAT16) {
1316 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1317 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1318 keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1319 outShape);
1320 } else if (input.type == OperandType::TENSOR_FLOAT32) {
1321 success = meanGeneric<float, float>(
1322 reinterpret_cast<float*>(input.buffer), input.shape(),
1323 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1324 reinterpret_cast<float*>(output.buffer), outShape);
1325 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1326 success = meanGeneric<uint8_t, int32_t>(
1327 reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1328 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1329 reinterpret_cast<uint8_t*>(output.buffer), outShape);
1330 } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1331 success = meanGeneric<int8_t, int32_t>(
1332 reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1333 reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1334 reinterpret_cast<int8_t*>(output.buffer), outShape);
1335 }
1336 } break;
1337 case OperationType::ARGMAX:
1338 case OperationType::ARGMIN: {
1339 if (!allParametersPresent(2, 1)) {
1340 return ANEURALNETWORKS_BAD_DATA;
1341 }
1342 const RunTimeOperandInfo& input = operands[ins[0]];
1343 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1344
1345 RunTimeOperandInfo& output = operands[outs[0]];
1346 Shape outShape = output.shape();
1347
1348 const bool isArgMin = operation.type == OperationType::ARGMIN;
1349 success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1350 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1351 argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1352 outShape);
1353 } break;
1354 case OperationType::EXPAND_DIMS: {
1355 if (!allParametersPresent(2, 1)) {
1356 return ANEURALNETWORKS_BAD_DATA;
1357 }
1358 const RunTimeOperandInfo& input = operands[ins[0]];
1359 int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1360
1361 RunTimeOperandInfo& output = operands[outs[0]];
1362 Shape outShape = output.shape();
1363
1364 success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1365 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1366 expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1367 } break;
1368 case OperationType::SPLIT: {
1369 const size_t outCount = outs.size();
1370 if (!allParametersPresent(3, outCount)) {
1371 return ANEURALNETWORKS_BAD_DATA;
1372 }
1373
1374 const RunTimeOperandInfo& input = operands[ins[0]];
1375 const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1376 const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1377
1378 if (static_cast<size_t>(numOutputs) != outs.size()) {
1379 return ANEURALNETWORKS_BAD_DATA;
1380 }
1381
1382 std::vector<Shape> outputShapes(numOutputs);
1383 for (int i = 0; i < numOutputs; ++i) {
1384 outputShapes[i] = operands[outs[i]].shape();
1385 }
1386
1387 success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1388 for (int i = 0; i < numOutputs; ++i) {
1389 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1390 outputShapes[i], &result);
1391 }
1392 switch (input.type) {
1393 case OperandType::TENSOR_FLOAT16: {
1394 std::vector<_Float16*> outputDataPtrs(numOutputs);
1395 for (int i = 0; i < numOutputs; ++i) {
1396 outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1397 }
1398 success = success &&
1399 splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1400 input.shape(), axis, &outputDataPtrs, outputShapes);
1401 } break;
1402 case OperandType::TENSOR_FLOAT32: {
1403 std::vector<float*> outputDataPtrs(numOutputs);
1404 for (int i = 0; i < numOutputs; ++i) {
1405 outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1406 }
1407 success = success &&
1408 splitFloat32(reinterpret_cast<const float*>(input.buffer),
1409 input.shape(), axis, &outputDataPtrs, outputShapes);
1410 } break;
1411 case OperandType::TENSOR_INT32: {
1412 std::vector<int32_t*> outputDataPtrs(numOutputs);
1413 for (int i = 0; i < numOutputs; ++i) {
1414 outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1415 }
1416 success = success &&
1417 splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1418 input.shape(), axis, &outputDataPtrs, outputShapes);
1419 } break;
1420 case OperandType::TENSOR_QUANT8_ASYMM: {
1421 std::vector<uint8_t*> outputDataPtrs(numOutputs);
1422 for (int i = 0; i < numOutputs; ++i) {
1423 outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1424 }
1425 success = success &&
1426 splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1427 input.shape(), axis, &outputDataPtrs, outputShapes);
1428 } break;
1429 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1430 std::vector<int8_t*> outputDataPtrs(numOutputs);
1431 for (int i = 0; i < numOutputs; ++i) {
1432 outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1433 }
1434 success = success &&
1435 splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1436 input.shape(), axis, &outputDataPtrs, outputShapes);
1437 } break;
1438 default: {
1439 return ANEURALNETWORKS_BAD_DATA;
1440 }
1441 }
1442 } break;
1443 case OperationType::MAXIMUM:
1444 case OperationType::MINIMUM: {
1445 if (!allParametersPresent(2, 1)) {
1446 return ANEURALNETWORKS_BAD_DATA;
1447 }
1448 const RunTimeOperandInfo& in1 = operands[ins[0]];
1449 const RunTimeOperandInfo& in2 = operands[ins[1]];
1450
1451 RunTimeOperandInfo& output = operands[outs[0]];
1452 Shape outputShape = output.shape();
1453
1454 const bool isMinimum = operation.type == OperationType::MINIMUM;
1455 success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1456 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1457 maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1458 isMinimum, output.buffer, outputShape);
1459 } break;
1460 case OperationType::GROUPED_CONV_2D: {
1461 const size_t inCount = ins.size();
1462 if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1463 return ANEURALNETWORKS_BAD_DATA;
1464 }
1465 const RunTimeOperandInfo& input = operands[ins[0]];
1466 const RunTimeOperandInfo& filter = operands[ins[1]];
1467 const RunTimeOperandInfo& bias = operands[ins[2]];
1468
1469 int32_t padding_left, padding_right;
1470 int32_t padding_top, padding_bottom;
1471 int32_t padding_implicit = 0;
1472 int32_t stride_width, stride_height;
1473 int32_t numGroups;
1474 int32_t activation;
1475 bool data_layout = false;
1476
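            // Two signatures are accepted: 12 inputs carry explicit padding values,
            // while 9 inputs carry an implicit padding scheme that is converted to
            // explicit padding further below.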
1477 if (inCount == 12) {
1478 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1479 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1480 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1481 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1482 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1483 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1484 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1485 activation = getScalarData<int32_t>(operands[ins[10]]);
1486 data_layout = getScalarData<bool>(operands[ins[11]]);
1487 } else {
1488 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1489 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1490 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1491 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1492 activation = getScalarData<int32_t>(operands[ins[7]]);
1493 data_layout = getScalarData<bool>(operands[ins[8]]);
1494 }
1495
1496 RunTimeOperandInfo& output = operands[outs[0]];
1497 Shape outShape = output.shape();
1498
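            // The reference kernels operate on NHWC data. If data_layout is true
            // (NCHW), convert the input into a temporary NHWC buffer here and convert
            // the result back after execution.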
1499 RunTimeOperandInfo input_tmp, output_tmp;
1500 std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1501 if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1502 success = false;
1503 break;
1504 }
1505 output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1506 output_tmp.buffer = data_layout ? nullptr : output.buffer;
1507 output_tmp.length = data_layout ? 0 : output.length;
1508
1509 if (inCount == 9) {
1510 Shape inputShape = input_tmp.shape();
1511 Shape filterShape = filter.shape();
1512 int32_t input_width = getSizeOfDimension(inputShape, 2);
1513 int32_t input_height = getSizeOfDimension(inputShape, 1);
1514 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1515 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1516 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1517 &padding_left, &padding_right);
1518 calculateExplicitPadding(input_height, stride_height, filter_height,
1519 padding_implicit, &padding_top, &padding_bottom);
1520 }
1521
1522 if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1523 padding_right, padding_top, padding_bottom, stride_width,
1524 stride_height, numGroups, &outShape) ||
1525 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1526 if (!data_layout) output.dimensions = output_tmp.dimensions;
1527 success = false;
1528 break;
1529 }
1530
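            // Select the kernel by input type; quantized inputs additionally dispatch
            // on the filter type (symmetric per-channel vs. asymmetric quantization).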
1531 if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1532 success = groupedConvFloat32(
1533 reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1534 reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1535 reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1536 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1537 numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1538 outShape);
1539 } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1540 success = groupedConvFloat16(
1541 reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1542 reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1543 reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1544 padding_right, padding_top, padding_bottom, stride_width, stride_height,
1545 numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1546 outShape);
1547 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1548 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1549 success = groupedConvQuant8PerChannel(
1550 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1551 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1552 std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1553 .scales.data(),
1554 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1555 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1556 stride_height, numGroups, activation,
1557 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1558 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1559 success = groupedConvQuant8(
1560 reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1561 reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1562 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1563 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1564 stride_height, numGroups, activation,
1565 reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1566 }
1567 } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1568 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1569 success = groupedConvQuant8PerChannel(
1570 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1571 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1572 std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1573 .scales.data(),
1574 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1575 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1576 stride_height, numGroups, activation,
1577 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1578 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1579 success = groupedConvQuant8(
1580 reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1581 reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1582 reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1583 padding_left, padding_right, padding_top, padding_bottom, stride_width,
1584 stride_height, numGroups, activation,
1585 reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1586 }
1587 }
1588
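            // When the layout was converted, output_tmp points at a freshly allocated
            // NHWC buffer; hand it to the guard so it is released after converting
            // back to the caller's layout.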
1589 if (data_layout) {
1590 output_tmp_guard.reset(output_tmp.buffer);
1591 }
1592 if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1593 success = false;
1594 break;
1595 }
1596 } break;
1597 case OperationType::TILE: {
1598 if (!allParametersPresent(2, 1)) {
1599 return ANEURALNETWORKS_BAD_DATA;
1600 }
1601 const RunTimeOperandInfo& input = operands[ins[0]];
1602 const RunTimeOperandInfo& multiples = operands[ins[1]];
1603
1604 RunTimeOperandInfo& output = operands[outs[0]];
1605 Shape outShape = output.shape();
1606
1607 success =
1608 tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1609 multiples.shape(), &outShape) &&
1610 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1611 tile::eval(input.buffer, input.shape(),
1612 reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1613 outShape);
1614 } break;
1615 case OperationType::QUANTIZED_16BIT_LSTM: {
1616 if (!allParametersPresent(15, 2)) {
1617 return ANEURALNETWORKS_BAD_DATA;
1618 }
1619
1620 RunTimeOperandInfo& cellStateOut =
1621 operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1622 RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1623
1624 Shape cellStateOutShape, outputShape;
1625 QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1626
1627 success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1628 &outputShape) &&
1629 setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1630 setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1631 quantizedLSTMCell.eval();
1632 } break;
1633 case OperationType::POW: {
1634 if (!allParametersPresent(2, 1)) {
1635 return ANEURALNETWORKS_BAD_DATA;
1636 }
1637 const RunTimeOperandInfo& base = operands[ins[0]];
1638 const RunTimeOperandInfo& exponent = operands[ins[1]];
1639
1640 RunTimeOperandInfo& output = operands[outs[0]];
1641 Shape outShape = output.shape();
1642
1643 success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1644 setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1645 pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1646 output.buffer, outShape);
1647 } break;
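        // Any operation not handled above is dispatched through the operation
        // resolver. The registration flags decide whether omitted operands and
        // zero-sized inputs are tolerated before prepare()/execute() run.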
1648 default: {
1649 const OperationRegistration* operationRegistration =
1650 mOperationResolver->findOperation(operation.type);
1651 if (operationRegistration == nullptr) {
1652 LOG(ERROR) << operation.type << " not registered";
1653 } else if (operationRegistration->prepare == nullptr ||
1654 operationRegistration->execute == nullptr) {
1655 LOG(ERROR) << "Incomplete operation registration: " << operation.type;
1656 } else {
1657 OperationExecutionContext context(&operation, operands);
1658 success = operationRegistration->flags.allowOmittedOperand ||
1659 context.checkNoOmittedOperand();
1660 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1661 context.checkNoZeroSizedInput());
1662 success = success && operationRegistration->prepare(&context) &&
1663 operationRegistration->execute(&context);
1664 result = context.getResultCode();
1665 }
1666 }
1667 }
1668 if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1669 result = ANEURALNETWORKS_OP_FAILED;
1670 }
1671 if (result != ANEURALNETWORKS_NO_ERROR) {
1672 LOG(ERROR) << operation.type << " failed.";
1673 }
1674
1675 consumeOperationInputs(ins, operands);
1676 return result;
1677 #else
1678 LOG(ERROR) << "Built without CPU execution support";
1679 return ANEURALNETWORKS_OP_FAILED;
1680 #endif // NN_INCLUDE_CPU_IMPLEMENTATION
1681 }
1682
1683 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1684 // to prevent deallocation of subgraph inputs and outputs.
1685 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1686 auto originalLifetime = to->lifetime;
1687 auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1688 *to = from;
1689 to->lifetime = originalLifetime;
1690 to->numberOfUsesLeft = originalNumberOfUsesLeft;
1691 }
1692
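// Executes an IF operation: reads the boolean condition operand, selects the "then"
// or "else" subgraph accordingly, wires the outer inputs and outputs into the
// branch's operands, and executes the branch.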
1693 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1694 namespace op = operation_if;
1695 const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1696 if (condOperand.buffer == nullptr) {
1697 LOG(ERROR) << "Cannot read IF condition operand value";
1698 return ANEURALNETWORKS_OP_FAILED;
1699 }
1700 const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1701 VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1702
1703 const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1704 const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1705 const Model::Subgraph& branchSubgraph =
1706 *reinterpret_cast<const Model::Subgraph*>(branchOperand.buffer);
1707 std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1708
1709 // Initialize inner input and output operands from outer operands.
1710 for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1711 setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1712 operands[operation.inputs[op::kFirstInput + i]]);
1713 }
1714 for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1715 setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1716 operands[operation.outputs[i]]);
1717 }
1718
1719 NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1720 freeUnusedSubgraphOperands(&branchOperands);
1721
1722 // Update outer outputs.
1723 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1724 setInfoExceptLifetime(&operands[operation.outputs[i]],
1725 branchOperands[branchSubgraph.outputIndexes[i]]);
1726 }
1727
1728 consumeOperationInputs(operation.inputs, operands);
1729 return ANEURALNETWORKS_NO_ERROR;
1730 }
1731
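// Executes a WHILE operation: runs the condition subgraph and, while it yields true,
// the body subgraph, double-buffering the body outputs between iterations and
// enforcing the configured loop timeout.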
1732 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1733 namespace op = operation_while;
1734 const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1735 const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1736 const Model::Subgraph& condSubgraph =
1737 *reinterpret_cast<const Model::Subgraph*>(condModelOperand.buffer);
1738 const Model::Subgraph& bodySubgraph =
1739 *reinterpret_cast<const Model::Subgraph*>(bodyModelOperand.buffer);
1740 std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1741 std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1742
1743 // The code below implements the following sequence of subgraph input and output buffer
1744 // assignments:
1745 //   iteration = 0   cond inputs = body inputs = outer inputs   body outputs = tmp1
1746 //   iteration = 1   cond inputs = body inputs = tmp1           body outputs = tmp2
1747 //   iteration = 2   cond inputs = body inputs = tmp2           body outputs = tmp1
1748 //   iteration = 3   cond inputs = body inputs = ...            body outputs = ...
1749
1750 // For body output double buffering.
1751 std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1752 std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1753
1754 // Ensure temporary loop buffers and subgraph operands are freed even on early return.
1755 auto cleanupGuard = base::make_scope_guard(
1756 [&tmp1, &tmp2, &condOperands, &bodyOperands, &operation, &operands] {
1757 auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1758 for (auto buffer : tmp) {
1759 if (buffer != nullptr) {
1760 delete[] buffer;
1761 }
1762 }
1763 };
1764
1765 freeLoopOutputs(tmp1);
1766 freeLoopOutputs(tmp2);
1767 freeUnusedSubgraphOperands(&condOperands);
1768 freeUnusedSubgraphOperands(&bodyOperands);
1769 consumeOperationInputs(operation.inputs, operands);
1770 });
1771
1772 // For body outputs with unknown shape, we skip double buffering and
1773 // allocate on each iteration instead. This allows growing output tensors
1774 // inside a WHILE loop.
1775 std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1776 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1777 const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1778 bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1779 }
1780
1781 // Initialize condition inputs from outer operands.
1782 for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1783 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1784 operands[operation.inputs[op::kFirstInput + i]]);
1785 }
1786
1787 // Store condition output on the stack.
1788 RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1789 bool8 condValue = {/* initialized memory */};
1790 condOutput.buffer = &condValue;
1791 condOutput.length = sizeof(condValue);
1792
1793 std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1794 const auto startTime = Clock::now();
1795 for (uint32_t iteration = 0;; ++iteration) {
1796 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1797 if (iteration != 0) {
1798 // Set condition inputs from previous iteration outputs.
1799 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1800 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1801 bodyOperands[bodySubgraph.outputIndexes[i]]);
1802 }
1803 }
1804 NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1805 VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1806 << static_cast<int>(condValue);
1807 if (!condValue) {
1808 break;
1809 }
1810
1811 const auto duration = Clock::now() - startTime;
1812 if (duration > timeoutDuration) {
1813 LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1814 << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1815 << " ms";
1816 return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1817 }
1818
1819 // Set body inputs from condition inputs.
1820 for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1821 bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1822 }
1823 // Set body outputs.
1824 auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1825 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1826 RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1827 if (bodyOutputHasUnknownShape[i]) {
1828 // Reset dimensions and buffer.
1829 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1830 if (outputBuffer[i] != nullptr) {
1831 delete[] outputBuffer[i];
1832 outputBuffer[i] = nullptr;
1833 }
1834 }
1835 info.buffer = outputBuffer[i];
1836 }
1837
1838 NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1839
1840 // Update output buffer information in case we have allocated new buffers.
1841 for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1842 outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1843 }
1844 }
1845
1846 // Copy body outputs to outer outputs.
1847 for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1848 RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1849 RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1850 if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1851 return error;
1852 }
1853 CHECK_EQ(outerOperand.length, innerOperand.length);
1854 // TODO: Use the outer buffer as tmp1 to avoid copies.
1855 std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1856 }
1857
1858 return ANEURALNETWORKS_NO_ERROR;
1859 }
1860
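// Records the dimensions of each execution output and whether the caller-provided
// buffer was large enough, so they can be reported back with the execution result.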
1861 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1862 const std::vector<RunTimeOperandInfo>& operands) {
1863 mOutputShapes.resize(outputIndexes.size());
1864 for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1865 const uint32_t operandIndex = outputIndexes[i];
1866 const RunTimeOperandInfo& from = operands[operandIndex];
1867 mOutputShapes[i].dimensions = from.dimensions;
1868 mOutputShapes[i].isSufficient = from.isSufficient();
1869 VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
1870 << "] = " << mOutputShapes[i];
1871 }
1872 }
1873
1874 // b/109953668, disable OpenMP
1875 #ifdef NNAPI_OPENMP
1876 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1877 mBlocktimeInitial = kmp_get_blocktime();
1878 kmp_set_blocktime(20); // ms, see b/109645291
1879
1880 #if NNAPI_LIMIT_CPU_THREADS
1881 // Code not yet enabled. The number of threads should be chosen based on
1882 // benchmarking. See the longer comment by the class declaration.
1883 mMaxThreadsInitial = Eigen::nbThreads();
1884 const int nProcs = omp_get_num_procs();
1885 int threads = nProcs;
1886 if (nProcs >= 8) {
1887 threads = nProcs - 4;
1888 } else if (nProcs >= 4) {
1889 threads = nProcs - 2;
1890 }
1891 Eigen::setNbThreads(threads);
1892 #endif
1893 }
1894
1895 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1896 kmp_set_blocktime(mBlocktimeInitial);
1897 #if NNAPI_LIMIT_CPU_THREADS
1898 Eigen::setNbThreads(mMaxThreadsInitial);
1899 #endif
1900 }
1901 #endif // NNAPI_OPENMP
1902
1903 } // namespace nn
1904 } // namespace android
1905