1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "CpuExecutor"
18 
19 #include "CpuExecutor.h"
20 
21 #include <android-base/scopeguard.h>
22 #include <nnapi/SharedMemory.h>
23 #include <nnapi/TypeUtils.h>
24 
25 #include <limits>
26 #include <memory>
27 #include <utility>
28 #include <vector>
29 
30 #include "ControlFlow.h"
31 #include "NeuralNetworks.h"
32 #include "OperationResolver.h"
33 #include "Operations.h"
34 #include "OperationsUtils.h"
35 #include "Tracing.h"
36 
37 // b/109953668, disable OpenMP
38 #ifdef NNAPI_OPENMP
39 #include <omp.h>
40 
41 #include <Eigen/Core>
42 #endif  // NNAPI_OPENMP
43 
44 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
45 #include "operations/BidirectionalSequenceLSTM.h"
46 #include "operations/Cast.h"
47 #include "operations/EmbeddingLookup.h"
48 #include "operations/ExpandDims.h"
49 #include "operations/HashtableLookup.h"
50 #include "operations/LSHProjection.h"
51 #include "operations/LSTM.h"
52 #include "operations/MaximumMinimum.h"
53 #include "operations/Multinomial.h"
54 #include "operations/Pow.h"
55 #include "operations/QuantizedLSTM.h"
56 #include "operations/RNN.h"
57 #include "operations/SVDF.h"
58 #include "operations/Tile.h"
59 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
60 
61 namespace android {
62 namespace nn {
63 namespace {
64 
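// Adapts an Operation and its RunTimeOperandInfo array to the IOperationExecutionContext
// interface that OperationResolver-based operation implementations consume.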
65 class OperationExecutionContext : public IOperationExecutionContext {
66     DISALLOW_IMPLICIT_CONSTRUCTORS(OperationExecutionContext);
67 
68    public:
69     OperationExecutionContext(const Operation* operation, RunTimeOperandInfo* operands)
70         : operation(operation), operands(operands) {}
71 
72     uint32_t getNumInputs() const override;
73     OperandType getInputType(uint32_t index) const override;
74     Shape getInputShape(uint32_t index) const override;
75     const void* getInputBuffer(uint32_t index) const override;
76     const Operand::ExtraParams& getInputExtraParams(uint32_t index) const override;
77 
78     uint32_t getNumOutputs() const override;
79     OperandType getOutputType(uint32_t index) const override;
80     Shape getOutputShape(uint32_t index) const override;
81     void* getOutputBuffer(uint32_t index) override;
82 
83     // Return false on failure and store the result code.
84     // Use getResultCode() to retrieve it at the end of the operation execution.
85     bool setOutputShape(uint32_t index, const Shape& shape) override;
86     int getResultCode() const;
87 
88     bool isOmittedInput(uint32_t index) const override;
89     bool isOmittedOutput(uint32_t index) const override;
90 
91     // Return false if any of the inputs or outputs is omitted, i.e. has a lifetime of NO_VALUE.
92     bool checkNoOmittedOperand() const;
93     // Return false if any of the inputs has a dimension of size 0.
94     bool checkNoZeroSizedInput() const;
95 
96    private:
97     const RunTimeOperandInfo* getInputInfo(uint32_t index) const;
98     const RunTimeOperandInfo* getOutputInfo(uint32_t index) const;
99     RunTimeOperandInfo* getOutputInfo(uint32_t index);
100 
101     const Operation* operation;
102     RunTimeOperandInfo* operands;
103 
104     int result = ANEURALNETWORKS_NO_ERROR;
105 };
106 
107 const RunTimeOperandInfo* OperationExecutionContext::getInputInfo(uint32_t index) const {
108     CHECK(index < operation->inputs.size());
109     return &operands[operation->inputs[index]];
110 }
111 
112 const RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) const {
113     CHECK(index < operation->outputs.size());
114     return &operands[operation->outputs[index]];
115 }
116 
117 RunTimeOperandInfo* OperationExecutionContext::getOutputInfo(uint32_t index) {
118     CHECK(index < operation->outputs.size());
119     return &operands[operation->outputs[index]];
120 }
121 
122 OperandType OperationExecutionContext::getInputType(uint32_t index) const {
123     return getInputInfo(index)->type;
124 }
125 
126 Shape OperationExecutionContext::getInputShape(uint32_t index) const {
127     return getInputInfo(index)->shape();
128 }
129 
130 const void* OperationExecutionContext::getInputBuffer(uint32_t index) const {
131     return getInputInfo(index)->buffer;
132 }
133 
134 const Operand::ExtraParams& OperationExecutionContext::getInputExtraParams(uint32_t index) const {
135     return getInputInfo(index)->extraParams;
136 }
137 
138 OperandType OperationExecutionContext::getOutputType(uint32_t index) const {
139     return getOutputInfo(index)->type;
140 }
141 
142 Shape OperationExecutionContext::getOutputShape(uint32_t index) const {
143     return getOutputInfo(index)->shape();
144 }
145 
146 void* OperationExecutionContext::getOutputBuffer(uint32_t index) {
147     return getOutputInfo(index)->buffer;
148 }
149 
150 uint32_t OperationExecutionContext::getNumInputs() const {
151     return operation->inputs.size();
152 }
153 
154 uint32_t OperationExecutionContext::getNumOutputs() const {
155     return operation->outputs.size();
156 }
157 
158 int OperationExecutionContext::getResultCode() const {
159     return result;
160 }
161 
162 // TODO: Return error code directly once we've fully integrated OperationResolver with all ops.
163 // Updates the RunTimeOperandInfo with the newly calculated shape.
164 // Allocates the buffer if needed.
165 //
166 // TODO(b/153081229): This function currently cannot handle extension operands well. We need to
167 //                    propagate the extension type info into this function.
168 bool setInfoAndAllocateIfNeeded(RunTimeOperandInfo* info, const Shape& shape, int* result) {
169     // For user-provided model output operands, the parameters must match the Shape
170     // calculated from the preparation step.
171     if (info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT) {
172         if (info->type != shape.type) {
173             LOG(ERROR) << "Invalid type for model output";
174             *result = ANEURALNETWORKS_OP_FAILED;
175             return false;
176         }
177         if (info->scale != shape.scale) {
178             LOG(ERROR) << "Invalid scale for model output";
179             *result = ANEURALNETWORKS_OP_FAILED;
180             return false;
181         }
182         if (info->zeroPoint != shape.offset) {
183             LOG(ERROR) << "Invalid zeroPoint for model output";
184             *result = ANEURALNETWORKS_OP_FAILED;
185             return false;
186         }
187         if (info->extraParams != shape.extraParams) {
188             LOG(ERROR) << "Invalid extraParams for model output";
189             *result = ANEURALNETWORKS_OP_FAILED;
190             return false;
191         }
192     }
193 
194     auto combined = combineDimensions(shape.dimensions, info->dimensions);
195     if (!combined.has_value()) {
196         LOG(ERROR) << "Invalid dimensions for model operand: " << combined.error();
197         *result = ANEURALNETWORKS_OP_FAILED;
198         return false;
199     }
200     info->dimensions = std::move(combined.value());
201     info->type = shape.type;
202     info->scale = shape.scale;
203     info->zeroPoint = shape.offset;
204     info->extraParams = shape.extraParams;
205 
206     // TODO(b/153081229): We bypass the overflow check on extension operands because we do not know
207     //                    the sizes of extension types.
208     if (!isExtension(info->type) &&
209         nonExtensionOperandSizeOfDataOverflowsUInt32(info->type, info->dimensions)) {
210         LOG(ERROR) << "Operand data size overflows uint32_t";
211         *result = ANEURALNETWORKS_OP_FAILED;
212         return false;
213     }
214 
215     // Allocate the buffer only if the combined dimension is fully specified
216     if (info->buffer == nullptr && (info->lifetime == Operand::LifeTime::TEMPORARY_VARIABLE ||
217                                     info->lifetime == Operand::LifeTime::SUBGRAPH_OUTPUT)) {
218         if (isExtension(info->type)) {
219             LOG(ERROR) << "Cannot allocate a variable of an extension type";
220             *result = ANEURALNETWORKS_OP_FAILED;
221             return false;
222         }
223         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
224         if (length > 0) {
225             info->buffer = new uint8_t[length];
226             if (info->buffer == nullptr) {
227                 *result = ANEURALNETWORKS_OUT_OF_MEMORY;
228                 return false;
229             }
230             info->length = length;
231         }
232     }
233     if (!info->isSufficient()) {
234         uint32_t length = nonExtensionOperandSizeOfData(info->type, info->dimensions);
235         LOG(ERROR) << "Insufficient size for model operand: require = " << length
236                    << ", provided = " << info->length;
237         *result = ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE;
238         return false;
239     }
240     *result = ANEURALNETWORKS_NO_ERROR;
241     return true;
242 }
243 
244 bool OperationExecutionContext::setOutputShape(uint32_t index, const Shape& shape) {
245     return setInfoAndAllocateIfNeeded(getOutputInfo(index), shape, &result);
246 }
247 
248 bool OperationExecutionContext::isOmittedInput(uint32_t index) const {
249     return getInputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
250 }
251 
252 bool OperationExecutionContext::isOmittedOutput(uint32_t index) const {
253     return getOutputInfo(index)->lifetime == Operand::LifeTime::NO_VALUE;
254 }
255 
256 bool OperationExecutionContext::checkNoOmittedOperand() const {
257     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
258         NN_RET_CHECK(!isOmittedInput(i))
259                 << operation->type << " input operand " << i << " is required but missing.";
260     }
261     for (uint32_t i = 0; i < operation->outputs.size(); i++) {
262         NN_RET_CHECK(!isOmittedOutput(i))
263                 << operation->type << " output operand " << i << " is required but missing.";
264     }
265     return true;
266 }
267 
268 bool OperationExecutionContext::checkNoZeroSizedInput() const {
269     for (uint32_t i = 0; i < operation->inputs.size(); i++) {
270         if (isOmittedInput(i)) continue;
271         for (uint32_t j = 0; j < getInputInfo(i)->dimensions.size(); j++) {
272             NN_RET_CHECK_NE(getInputInfo(i)->dimensions[j], 0)
273                     << operation->type << " does not support zero-sized tensor, but input " << i
274                     << " dimension " << j << " is 0.";
275         }
276     }
277     return true;
278 }
279 
280 }  // namespace
281 
282 // Used to keep a pointer to a memory pool.
283 //
284 // In the case of an "mmap_fd" pool, owns the mmap region
285 // returned by getBuffer() -- i.e., that region goes away
286 // when the RunTimePoolInfo is destroyed or is assigned to.
287 class RunTimePoolInfo::RunTimePoolInfoImpl {
288    public:
289     RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping);
290 
291     uint8_t* getBuffer() const;
292     uint32_t getSize() const;
293 
294     bool flush() const;
295 
296     const SharedMemory& getMemory() const { return mMemory; }
297 
298    private:
299     const SharedMemory mMemory;
300     const Mapping mMapping;
301 };
302 
303 RunTimePoolInfo::RunTimePoolInfoImpl::RunTimePoolInfoImpl(SharedMemory memory, Mapping mapping)
304     : mMemory(std::move(memory)), mMapping(std::move(mapping)) {}
305 
306 uint8_t* RunTimePoolInfo::RunTimePoolInfoImpl::getBuffer() const {
307     return std::visit(
308             [](auto* pointer) {
309                 // Writing to a const buffer may lead to undefined behavior.
310                 // TODO: Refactor the code to avoid the const_cast.
311                 return static_cast<uint8_t*>(const_cast<void*>(pointer));
312             },
313             mMapping.pointer);
314 }
315 
316 uint32_t RunTimePoolInfo::RunTimePoolInfoImpl::getSize() const {
317     CHECK_LE(mMapping.size, std::numeric_limits<uint32_t>::max());
318     return static_cast<uint32_t>(mMapping.size);
319 }
320 
321 // Makes sure the output data is correctly updated after execution.
322 bool RunTimePoolInfo::RunTimePoolInfoImpl::flush() const {
323     return nn::flush(mMapping);
324 }
325 
326 // TODO: short term, make shared memory mapping and updating a utility function.
327 // TODO: long term, implement mmap_fd as a hidl IMemory service.
328 std::optional<RunTimePoolInfo> RunTimePoolInfo::createFromMemory(const SharedMemory& memory) {
329     auto mapping = map(memory);
330     if (!mapping.has_value()) {
331         LOG(ERROR) << "Can't map shared memory: " << mapping.error().message;
332         return std::nullopt;
333     }
334     const auto impl =
335             std::make_shared<const RunTimePoolInfoImpl>(memory, std::move(mapping).value());
336     return RunTimePoolInfo(impl);
337 }
338 
339 RunTimePoolInfo RunTimePoolInfo::createFromExistingBuffer(uint8_t* buffer, uint32_t size) {
340     auto mapping = Mapping{.pointer = buffer, .size = size};
341     const auto impl = std::make_shared<const RunTimePoolInfoImpl>(std::make_shared<const Memory>(),
342                                                                   std::move(mapping));
343     return RunTimePoolInfo(impl);
344 }
345 
346 RunTimePoolInfo::RunTimePoolInfo(const std::shared_ptr<const RunTimePoolInfoImpl>& impl)
347     : mImpl(impl) {}
348 
349 uint8_t* RunTimePoolInfo::getBuffer() const {
350     return mImpl->getBuffer();
351 }
352 
353 uint32_t RunTimePoolInfo::getSize() const {
354     return mImpl->getSize();
355 }
356 
357 bool RunTimePoolInfo::flush() const {
358     return mImpl->flush();
359 }
360 
361 const SharedMemory& RunTimePoolInfo::getMemory() const {
362     return mImpl->getMemory();
363 }
364 
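// Maps each SharedMemory pool into a RunTimePoolInfo. On any mapping failure the output vector
// is cleared and false is returned.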
365 bool setRunTimePoolInfosFromCanonicalMemories(std::vector<RunTimePoolInfo>* poolInfos,
366                                               const std::vector<SharedMemory>& pools) {
367     CHECK(poolInfos != nullptr);
368     poolInfos->clear();
369     poolInfos->reserve(pools.size());
370     for (const auto& pool : pools) {
371         if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromMemory(pool)) {
372             poolInfos->push_back(*poolInfo);
373         } else {
374             LOG(ERROR) << "Could not map pools";
375             poolInfos->clear();
376             return false;
377         }
378     }
379     return true;
380 }
381 
382 bool setRunTimePoolInfosFromMemoryPools(std::vector<RunTimePoolInfo>* poolInfos,
383                                         const std::vector<Request::MemoryPool>& pools) {
384     CHECK(poolInfos != nullptr);
385     poolInfos->clear();
386     poolInfos->reserve(pools.size());
387     for (const auto& pool : pools) {
388         if (!std::holds_alternative<SharedMemory>(pool)) {
389             LOG(ERROR) << "Unknown memory token";
390             poolInfos->clear();
391             return false;
392         }
393         if (std::optional<RunTimePoolInfo> poolInfo =
394                     RunTimePoolInfo::createFromMemory(std::get<SharedMemory>(pool))) {
395             poolInfos->push_back(*poolInfo);
396         } else {
397             LOG(ERROR) << "Could not map pools";
398             poolInfos->clear();
399             return false;
400         }
401     }
402     return true;
403 }
404 
405 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
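// Copies a 4-D tensor stored in NCHW order into NHWC order. fromDim holds the source (NCHW)
// dimensions, i.e. {N, C, H, W}.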
406 template <typename T>
407 inline bool convertToNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
408     uint32_t spatialSize = fromDim[2] * fromDim[3];
409     for (uint32_t n = 0; n < fromDim[0]; n++) {
410         for (uint32_t hw = 0; hw < spatialSize; hw++) {
411             for (uint32_t c = 0; c < fromDim[1]; c++) {
412                 uint32_t fromIndex = n * fromDim[1] * spatialSize + c * spatialSize + hw;
413                 *to++ = from[fromIndex];
414             }
415         }
416     }
417     return true;
418 }
419 
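// Copies a 4-D tensor stored in NHWC order back into NCHW order. fromDim holds the source
// (NHWC) dimensions, i.e. {N, H, W, C}.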
420 template <typename T>
421 inline bool convertFromNhwcImpl(T* to, const T* from, const std::vector<uint32_t>& fromDim) {
422     uint32_t spatialSize = fromDim[1] * fromDim[2];
423     for (uint32_t n = 0; n < fromDim[0]; n++) {
424         for (uint32_t c = 0; c < fromDim[3]; c++) {
425             for (uint32_t hw = 0; hw < spatialSize; hw++) {
426                 uint32_t fromIndex = n * spatialSize * fromDim[3] + hw * fromDim[3] + c;
427                 *to++ = from[fromIndex];
428             }
429         }
430     }
431     return true;
432 }
433 
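// Produces an NHWC view of 'from' in 'to'. When data_layout is true, the source is NCHW and a
// converted copy is allocated (owned by ptr_guard); otherwise 'to' simply aliases 'from'.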
434 static bool convertToNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
435                           std::unique_ptr<uint8_t[]>& ptr_guard, bool data_layout) {
436     int result;
437     if (from.dimensions.size() != 4) {
438         LOG(ERROR) << "Error converting a non-4-D tensor to NHWC layout";
439         return false;
440     }
441     to.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
442     if (data_layout) {
443         // convert dimensions
444         Shape inShape = from.shape();
445         auto& fromDim = from.dimensions;
446         inShape.dimensions = {fromDim[0], fromDim[2], fromDim[3], fromDim[1]};
447         // allocate buffer
448         to.buffer = nullptr;
449         if (!setInfoAndAllocateIfNeeded(&to, inShape, &result)) {
450             return false;
451         }
452         ptr_guard.reset(to.buffer);
453         // convert value
454         if (from.type == OperandType::TENSOR_FLOAT32) {
455             return convertToNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
456                                             reinterpret_cast<const float*>(from.buffer), fromDim);
457         } else if (from.type == OperandType::TENSOR_FLOAT16) {
458             return convertToNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
459                                                reinterpret_cast<const _Float16*>(from.buffer),
460                                                fromDim);
461         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
462             return convertToNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
463                                               reinterpret_cast<const uint8_t*>(from.buffer),
464                                               fromDim);
465         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
466             return convertToNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
467                                              reinterpret_cast<const int8_t*>(from.buffer), fromDim);
468         } else {
469             LOG(ERROR) << "Unsupported data type";
470             return false;
471         }
472     } else {
473         to = from;
474     }
475     return true;
476 }
477 
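// Writes the NHWC result in 'from' back to the caller-visible operand 'to'. When data_layout is
// true the data is converted to NCHW; otherwise 'to' reuses 'from's buffer directly.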
478 static bool convertFromNhwc(RunTimeOperandInfo& to, const RunTimeOperandInfo& from,
479                             bool data_layout, int* result) {
480     if (from.dimensions.size() != 4) {
481         LOG(ERROR) << "Error converting a non-4-D tensor from NHWC layout";
482         return false;
483     }
484     if (data_layout) {
485         // convert dimensions
486         Shape outShape = from.shape();
487         auto& fromDim = from.dimensions;
488         outShape.dimensions = {fromDim[0], fromDim[3], fromDim[1], fromDim[2]};
489         // allocate buffer
490         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
491             return false;
492         }
493         // convert value
494         if (from.type == OperandType::TENSOR_FLOAT32) {
495             return convertFromNhwcImpl<float>(reinterpret_cast<float*>(to.buffer),
496                                               reinterpret_cast<const float*>(from.buffer), fromDim);
497         } else if (from.type == OperandType::TENSOR_FLOAT16) {
498             return convertFromNhwcImpl<_Float16>(reinterpret_cast<_Float16*>(to.buffer),
499                                                  reinterpret_cast<const _Float16*>(from.buffer),
500                                                  fromDim);
501         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM) {
502             return convertFromNhwcImpl<uint8_t>(reinterpret_cast<uint8_t*>(to.buffer),
503                                                 reinterpret_cast<const uint8_t*>(from.buffer),
504                                                 fromDim);
505         } else if (from.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
506             return convertFromNhwcImpl<int8_t>(reinterpret_cast<int8_t*>(to.buffer),
507                                                reinterpret_cast<const int8_t*>(from.buffer),
508                                                fromDim);
509         } else {
510             LOG(ERROR) << "Unsupported data type";
511             return false;
512         }
513     } else {
514         Shape outShape = from.shape();
515         to.buffer = from.buffer;
516         to.length = from.length;
517         if (!setInfoAndAllocateIfNeeded(&to, outShape, result)) {
518             return false;
519         }
520     }
521     return true;
522 }
523 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
524 
525 // Decrements the usage count for the operands listed.  Frees the memory
526 // allocated for any temporary variable with a count of zero.
527 static void consumeOperationInputs(const std::vector<uint32_t>& inputs,
528                                    RunTimeOperandInfo* operands) {
529     for (uint32_t i : inputs) {
530         auto& info = operands[i];
531         // Skip operands that are not reference-counted (constants and model inputs/outputs).
532         if (info.numberOfUsesLeft == 0) {
533             continue;
534         }
535         info.numberOfUsesLeft--;
536         if (info.numberOfUsesLeft == 0 && info.buffer != nullptr) {
537             delete[] info.buffer;
538             info.buffer = nullptr;
539         }
540     }
541 }
542 
543 // This function only frees TEMPORARY_VARIABLE operands that are unused
544 // outputs because consumeOperationInputs takes care of any operands
545 // that are inputs to an operation.
546 static void freeUnusedSubgraphOperands(std::vector<RunTimeOperandInfo>* operands) {
547     for (auto& info : *operands) {
548         if (info.lifetime == Operand::LifeTime::TEMPORARY_VARIABLE && info.numberOfUsesLeft == 0 &&
549             info.buffer != nullptr) {
550             delete[] info.buffer;
551             info.buffer = nullptr;
552         }
553     }
554 }
555 
556 // Ignore the .pools entry in model and request.  This will have been taken care of
557 // by the caller.
558 int CpuExecutor::run(const Model& model, const Request& request,
559                      const std::vector<RunTimePoolInfo>& modelPoolInfos,
560                      const std::vector<RunTimePoolInfo>& requestPoolInfos) {
561     NNTRACE_CPU(NNTRACE_PHASE_EXECUTION, "run");
562     VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(request) << ")";
563     mModelOperandValues = model.operandValues.data();
564     mModelPoolInfos = &modelPoolInfos;
565     mReferencedSubgraphs = &model.referenced;
566 
567     // b/109953668, disable OpenMP
568 #ifdef NNAPI_OPENMP
569     ScopedOpenmpSettings openMpSettings;
570 #endif  // NNAPI_OPENMP
571 
572     std::vector<RunTimeOperandInfo> operands = initializeRunTimeInfo(model.main);
573     updateForArguments(model.main.inputIndexes, request.inputs, requestPoolInfos, operands.data());
574     updateForArguments(model.main.outputIndexes, request.outputs, requestPoolInfos,
575                        operands.data());
576     int result = executeSubgraph(model.main, operands.data());
577     freeUnusedSubgraphOperands(&operands);
578 
579     if (result == ANEURALNETWORKS_NO_ERROR) {
580         VLOG(CPUEXE) << "Completed run normally";
581         for (auto& runtimeInfo : requestPoolInfos) {
582             runtimeInfo.flush();
583         }
584     }
585 
586     // Only report the output shapes when the result code is NO_ERROR or OUTPUT_INSUFFICIENT_SIZE.
587     if (result == ANEURALNETWORKS_NO_ERROR || result == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
588         setOutputShapes(model.main.outputIndexes, operands);
589     } else {
590         mOutputShapes.clear();
591     }
592 
593     mFinished = true;
594     mModelOperandValues = nullptr;
595     mModelPoolInfos = nullptr;
596     mReferencedSubgraphs = nullptr;
597     return result;
598 }
599 
600 int CpuExecutor::executeSubgraph(const Model::Subgraph& subgraph, RunTimeOperandInfo* operands) {
601     VLOG(CPUEXE) << "CpuExecutor::executeSubgraph " << subgraph;
602     // The graph has serialized the operations in execution order.
603     for (const auto& operation : subgraph.operations) {
604         NN_RETURN_IF_ERROR(executeOperation(operation, operands));
605     }
606     return ANEURALNETWORKS_NO_ERROR;
607 }
608 
609 std::vector<RunTimeOperandInfo> CpuExecutor::initializeRunTimeInfo(
610         const Model::Subgraph& subgraph) {
611     VLOG(CPUEXE) << "CpuExecutor::initializeRunTimeInfo";
612     const size_t count = subgraph.operands.size();
613     std::vector<RunTimeOperandInfo> operands(count);
614     std::vector<uint32_t> numberOfConsumers =
615             countNumberOfConsumers(count, subgraph.operations).value();
616     for (size_t i = 0; i < count; i++) {
617         const Operand& from = subgraph.operands[i];
618         RunTimeOperandInfo& to = operands[i];
619         to.type = from.type;
620         to.dimensions = from.dimensions;
621         to.scale = from.scale;
622         to.zeroPoint = from.zeroPoint;
623         to.length = from.location.length;
624         to.lifetime = from.lifetime;
625         to.extraParams = from.extraParams;
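        // Only TEMPORARY_VARIABLE operands are reference-counted; every other lifetime keeps
        // numberOfUsesLeft at 0 so consumeOperationInputs() never frees its buffer.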
626         switch (from.lifetime) {
627             case Operand::LifeTime::TEMPORARY_VARIABLE:
628                 to.buffer = nullptr;
629                 to.numberOfUsesLeft = numberOfConsumers[i];
630                 break;
631             case Operand::LifeTime::CONSTANT_COPY:
632                 to.buffer = const_cast<uint8_t*>(mModelOperandValues + from.location.offset);
633                 to.numberOfUsesLeft = 0;
634                 break;
635             case Operand::LifeTime::CONSTANT_REFERENCE: {
636                 auto poolIndex = from.location.poolIndex;
637                 CHECK_LT(poolIndex, mModelPoolInfos->size());
638                 auto& r = (*mModelPoolInfos)[poolIndex];
639                 to.buffer = r.getBuffer() + from.location.offset;
640                 to.numberOfUsesLeft = 0;
641                 break;
642             }
643             case Operand::LifeTime::SUBGRAPH: {
644                 auto subgraphIndex = from.location.offset;
645                 CHECK_LT(subgraphIndex, mReferencedSubgraphs->size());
646                 to.buffer = reinterpret_cast<uint8_t*>(
647                         const_cast<Model::Subgraph*>(&(*mReferencedSubgraphs)[subgraphIndex]));
648                 to.numberOfUsesLeft = 0;
649             } break;
650             case Operand::LifeTime::POINTER: {
651                 to.buffer = reinterpret_cast<uint8_t*>(
652                         const_cast<void*>(std::get<const void*>(from.location.pointer)));
653                 to.numberOfUsesLeft = 0;
654             } break;
655             case Operand::LifeTime::SUBGRAPH_INPUT:
656             case Operand::LifeTime::SUBGRAPH_OUTPUT:
657             case Operand::LifeTime::NO_VALUE:
658                 to.buffer = nullptr;
659                 to.numberOfUsesLeft = 0;
660                 break;
661         }
662     }
663     return operands;
664 }
665 
666 void CpuExecutor::updateForArguments(const std::vector<uint32_t>& indexes,
667                                      const std::vector<Request::Argument>& arguments,
668                                      const std::vector<RunTimePoolInfo>& requestPoolInfos,
669                                      RunTimeOperandInfo* operands) {
670     CHECK_EQ(indexes.size(), arguments.size());
671     for (size_t i = 0; i < indexes.size(); i++) {
672         const uint32_t operandIndex = indexes[i];
673         const Request::Argument& from = arguments[i];
674         RunTimeOperandInfo& to = operands[operandIndex];
675         if (!from.dimensions.empty()) {
676             // It's the responsibility of the caller to validate that
677             // from.dimensions only modifies the dimensions that were
678             // unspecified in the model.  That's the case in SampleDriver.cpp
679             // with the call to validateRequest().
680             // TODO make sure that's the case for the default CPU path.
681             to.dimensions = from.dimensions;
682         }
683         switch (from.lifetime) {
684             case Request::Argument::LifeTime::NO_VALUE: {
685                 to.lifetime = Operand::LifeTime::NO_VALUE;
686                 CHECK(to.buffer == nullptr);
687                 to.length = 0;
688                 break;
689             }
690             case Request::Argument::LifeTime::POOL: {
691                 auto poolIndex = from.location.poolIndex;
692                 CHECK_LT(poolIndex, requestPoolInfos.size());
693                 auto& r = requestPoolInfos[poolIndex];
694                 to.buffer = r.getBuffer() + from.location.offset;
695                 if (from.location.offset == 0 && from.location.length == 0) {
696                     // Use the entire memory region.
697                     to.length = r.getSize();
698                 } else {
699                     to.length = from.location.length;
700                 }
701                 break;
702             }
703             case Request::Argument::LifeTime::POINTER: {
704                 constexpr auto fn = [](const void* ptr) {
705                     return static_cast<const uint8_t*>(ptr);
706                 };
707                 auto ptr = std::visit(fn, from.location.pointer);
708                 // Writing to a const buffer may lead to undefined behavior.
709                 // TODO: Refactor the code to avoid the const_cast.
710                 to.buffer = const_cast<uint8_t*>(ptr);
711                 to.length = from.location.length;
712                 break;
713             }
714         }
715     }
716 }
717 
718 int CpuExecutor::executeOperation(const Operation& operation, RunTimeOperandInfo* operands) {
719 #ifdef NN_INCLUDE_CPU_IMPLEMENTATION
720     if (hasDeadlinePassed(mDeadline)) {
721         return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
722     }
723     if (operation.type == OperationType::IF) {
724         int result = executeIfOperation(operation, operands);
725         if (result != ANEURALNETWORKS_NO_ERROR) {
726             LOG(ERROR) << "IF failed.";
727         }
728         return result;
729     }
730     if (operation.type == OperationType::WHILE) {
731         int result = executeWhileOperation(operation, operands);
732         if (result != ANEURALNETWORKS_NO_ERROR) {
733             LOG(ERROR) << "WHILE failed.";
734         }
735         return result;
736     }
737 
738     // VLOG(CPUEXE) << "CpuExecutor::executeOperation(" << operation << ")";
739     const std::vector<uint32_t>& ins = operation.inputs;
740     const std::vector<uint32_t>& outs = operation.outputs;
741     bool success = false;
742     int result = ANEURALNETWORKS_NO_ERROR;
743 
744     // Function to verify that the number of input and output parameters
745     // matches what is expected.  Also checks that all the parameters have
746     // values. This function is to be used only for operations that do not
747     // accept optional arguments.
748     // TODO Have a version that works for optional arguments.
749     auto allParametersPresent = [&operation, &operands, &ins, &outs](size_t requiredIns,
750                                                                      size_t requiredOuts) -> bool {
751         auto verify = [&operation, &operands](size_t requiredCount,
752                                               const std::vector<uint32_t>& indexes,
753                                               const char* type) -> bool {
754             size_t actualCount = indexes.size();
755             if (actualCount != requiredCount) {
756                 LOG(ERROR) << operation.type << ": Invalid number of " << type << " operands. Got "
757                            << actualCount << " of " << requiredCount;
758                 return false;
759             }
760             for (size_t i = 0; i < actualCount; i++) {
761                 if (operands[indexes[i]].lifetime == Operand::LifeTime::NO_VALUE) {
762                     LOG(ERROR) << operation.type << " " << type << " operand " << i
763                                << " is required but missing.";
764                     return false;
765                 }
766             }
767             return true;
768         };
769 
770         auto verifyNoZeroSizedInputs = [&operation,
771                                         &operands](const std::vector<uint32_t>& indexes) {
772             for (size_t i = 0; i < indexes.size(); i++) {
773                 for (size_t j = 0; j < operands[indexes[i]].dimensions.size(); j++) {
774                     if (operands[indexes[i]].dimensions[j] == 0) {
775                         LOG(ERROR) << operation.type
776                                    << " does not support zero-sized tensor, but input " << i
777                                    << " dimension " << j << " is zero.";
778                         return false;
779                     }
780                 }
781             }
782             return true;
783         };
784 
785         return verify(requiredIns, ins, "in") && verify(requiredOuts, outs, "out") &&
786                verifyNoZeroSizedInputs(ins);
787     };
788 
789     switch (operation.type) {
790         case OperationType::OEM_OPERATION: {
791             LOG(ERROR) << "OEM operation not supported for CPU execution";
792             success = false;
793         } break;
794         case OperationType::RESHAPE: {
795             if (!allParametersPresent(2, 1)) {
796                 return ANEURALNETWORKS_BAD_DATA;
797             }
798             const RunTimeOperandInfo& input = operands[ins[0]];
799             const RunTimeOperandInfo& targetShape = operands[ins[1]];
800 
801             RunTimeOperandInfo& output = operands[outs[0]];
802             Shape outShape = output.shape();
803 
804             success = reshapePrepare(input.shape(),
805                                      reinterpret_cast<const int32_t*>(targetShape.buffer),
806                                      getNumberOfElements(targetShape.shape()), &outShape) &&
807                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
808                       copyData(input.buffer, input.shape(), output.buffer, outShape);
809         } break;
810         case OperationType::DEPTH_TO_SPACE: {
811             const size_t inCount = ins.size();
812             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
813                 return ANEURALNETWORKS_BAD_DATA;
814             }
815             const RunTimeOperandInfo& input = operands[ins[0]];
816             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
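            // Optional layout scalar: true means the tensors use NCHW layout; false (the
            // default when the input is omitted) means NHWC.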
817             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
818 
819             RunTimeOperandInfo& output = operands[outs[0]];
820             Shape outShape = output.shape();
821 
822             RunTimeOperandInfo input_tmp, output_tmp;
823             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
824             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
825                 success = false;
826                 break;
827             }
828             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
829             output_tmp.buffer = data_layout ? nullptr : output.buffer;
830             output_tmp.length = data_layout ? 0 : output.length;
831             if (!depthToSpacePrepare(input_tmp.shape(), blockSize, &outShape) ||
832                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
833                 if (!data_layout) output.dimensions = output_tmp.dimensions;
834                 break;
835             }
836             switch (input_tmp.type) {
837                 case OperandType::TENSOR_FLOAT32: {
838                     success = depthToSpaceGeneric(
839                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
840                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
841                     break;
842                 }
843                 case OperandType::TENSOR_FLOAT16: {
844                     success = depthToSpaceGeneric(
845                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
846                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
847                     break;
848                 }
849                 case OperandType::TENSOR_QUANT8_ASYMM: {
850                     success = depthToSpaceGeneric(
851                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
852                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
853                     break;
854                 }
855                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
856                     success = depthToSpaceGeneric(
857                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
858                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
859                     break;
860                 }
861                 default: {
862                     LOG(ERROR) << "Unsupported data type";
863                     success = false;
864                 }
865             }
866             if (data_layout) {
867                 output_tmp_guard.reset(output_tmp.buffer);
868             }
869             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
870                 success = false;
871                 break;
872             }
873         } break;
874         case OperationType::SPACE_TO_DEPTH: {
875             const size_t inCount = ins.size();
876             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
877                 return ANEURALNETWORKS_BAD_DATA;
878             }
879             const RunTimeOperandInfo& input = operands[ins[0]];
880             int32_t blockSize = getScalarData<int32_t>(operands[ins[1]]);
881             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
882 
883             RunTimeOperandInfo& output = operands[outs[0]];
884             Shape outShape = output.shape();
885 
886             RunTimeOperandInfo input_tmp, output_tmp;
887             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
888             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
889                 success = false;
890                 break;
891             }
892             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
893             output_tmp.buffer = data_layout ? nullptr : output.buffer;
894             output_tmp.length = data_layout ? 0 : output.length;
895 
896             if (!spaceToDepthPrepare(input_tmp.shape(), blockSize, &outShape) ||
897                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
898                 if (!data_layout) output.dimensions = output_tmp.dimensions;
899                 break;
900             }
901             switch (input_tmp.type) {
902                 case OperandType::TENSOR_FLOAT32: {
903                     success = spaceToDepthGeneric(
904                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
905                             blockSize, reinterpret_cast<float*>(output_tmp.buffer), outShape);
906                     break;
907                 }
908                 case OperandType::TENSOR_FLOAT16: {
909                     success = spaceToDepthGeneric(
910                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
911                             blockSize, reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
912                     break;
913                 }
914                 case OperandType::TENSOR_QUANT8_ASYMM: {
915                     success = spaceToDepthGeneric(
916                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
917                             blockSize, reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
918                     break;
919                 }
920                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
921                     success = spaceToDepthGeneric(
922                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
923                             blockSize, reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
924                     break;
925                 }
926                 default: {
927                     LOG(ERROR) << "Unsupported data type";
928                     success = false;
929                 }
930             }
931             if (data_layout) {
932                 output_tmp_guard.reset(output_tmp.buffer);
933             }
934             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
935                 success = false;
936                 break;
937             }
938         } break;
939         case OperationType::EMBEDDING_LOOKUP: {
940             if (!allParametersPresent(2, 1)) {
941                 return ANEURALNETWORKS_BAD_DATA;
942             }
943             const RunTimeOperandInfo& values = operands[ins[EmbeddingLookup::kValueTensor]];
944             const RunTimeOperandInfo& lookups = operands[ins[EmbeddingLookup::kLookupTensor]];
945             RunTimeOperandInfo& output = operands[outs[EmbeddingLookup::kOutputTensor]];
946 
947             Shape outputShape;
948             EmbeddingLookup lookup(operation, operands);
949 
950             success = embeddingLookupPrepare(values.shape(), lookups.shape(), &outputShape) &&
951                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lookup.Eval();
952         } break;
953         case OperationType::HASHTABLE_LOOKUP: {
954             if (!allParametersPresent(3, 2)) {
955                 return ANEURALNETWORKS_BAD_DATA;
956             }
957             const RunTimeOperandInfo& lookups = operands[ins[HashtableLookup::kLookupTensor]];
958             const RunTimeOperandInfo& keys = operands[ins[HashtableLookup::kKeyTensor]];
959             const RunTimeOperandInfo& values = operands[ins[HashtableLookup::kValueTensor]];
960 
961             RunTimeOperandInfo& output = operands[outs[HashtableLookup::kOutputTensor]];
962             RunTimeOperandInfo& hits = operands[outs[HashtableLookup::kHitsTensor]];
963 
964             Shape outputShape, hitShape;
965             HashtableLookup lookup(operation, operands);
966 
967             success = hashtableLookupPrepare(lookups.shape(), keys.shape(), values.shape(),
968                                              &outputShape, &hitShape) &&
969                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
970                       setInfoAndAllocateIfNeeded(&hits, hitShape, &result) && lookup.Eval();
971         } break;
972         case OperationType::LSH_PROJECTION: {
973             RunTimeOperandInfo& output = operands[outs[LSHProjection::kOutputTensor]];
974             Shape outputShape;
975             if (!LSHProjection::Prepare(operation, operands, &outputShape) ||
976                 !setInfoAndAllocateIfNeeded(&output, outputShape, &result)) {
977                 break;
978             }
979 
980             LSHProjection lsh(operation, operands);
981             const RunTimeOperandInfo& hash = operands[ins[LSHProjection::kHashTensor]];
982             switch (hash.type) {
983                 case OperandType::TENSOR_FLOAT32: {
984                     success = lsh.Eval<float>();
985                     break;
986                 }
987                 case OperandType::TENSOR_FLOAT16: {
988                     success = lsh.Eval<_Float16>();
989                     break;
990                 }
991                 default: {
992                     success = false;
993                     LOG(ERROR) << "Unsupported data type";
994                 }
995             }
996         } break;
997         case OperationType::BIDIRECTIONAL_SEQUENCE_LSTM: {
998             const auto merge_outputs = getScalarData<bool>(
999                     operands[ins[BidirectionalSequenceLSTM::kMergeOutputsParam]]);
1000             const bool output_state = (outs.size() == 5 || outs.size() == 6);
1001             RunTimeOperandInfo& fwOutput =
1002                     operands[outs[BidirectionalSequenceLSTM::kFwOutputTensor]];
1003             Shape fwOutputShape, bwOutputShape, fwOutputActivationStateShape,
1004                     fwOutputCellStateShape, bwOutputActivationStateShape, bwOutputCellStateShape;
1005 
1006             BidirectionalSequenceLSTM lstm(operation, operands);
1007             success = lstm.Prepare(operation, operands, &fwOutputShape, &bwOutputShape,
1008                                    &fwOutputActivationStateShape, &fwOutputCellStateShape,
1009                                    &bwOutputActivationStateShape, &bwOutputCellStateShape) &&
1010                       setInfoAndAllocateIfNeeded(&fwOutput, fwOutputShape, &result);
1011             if (!merge_outputs) {
1012                 RunTimeOperandInfo& bwOutput =
1013                         operands[outs[BidirectionalSequenceLSTM::kBwOutputTensor]];
1014                 success = success && setInfoAndAllocateIfNeeded(&bwOutput, bwOutputShape, &result);
1015             }
1016             if (output_state) {
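                // When outputs are merged there is no separate backward output tensor, so the
                // state output indices shift down by one.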
1017                 uint32_t delta = merge_outputs ? 1 : 0;
1018                 RunTimeOperandInfo& fwOutputActivationState =
1019                         operands[outs[BidirectionalSequenceLSTM::kFwOutputActivationStateTensor -
1020                                       delta]];
1021                 RunTimeOperandInfo& fwOutputCellState =
1022                         operands[outs[BidirectionalSequenceLSTM::kFwOutputCellStateTensor - delta]];
1023                 RunTimeOperandInfo& bwOutputActivationState =
1024                         operands[outs[BidirectionalSequenceLSTM::kBwOutputActivationStateTensor -
1025                                       delta]];
1026                 RunTimeOperandInfo& bwOutputCellState =
1027                         operands[outs[BidirectionalSequenceLSTM::kBwOutputCellStateTensor - delta]];
1028                 success = success &&
1029                           setInfoAndAllocateIfNeeded(&fwOutputActivationState,
1030                                                      fwOutputActivationStateShape, &result) &&
1031                           setInfoAndAllocateIfNeeded(&fwOutputCellState, fwOutputCellStateShape,
1032                                                      &result) &&
1033                           setInfoAndAllocateIfNeeded(&bwOutputActivationState,
1034                                                      bwOutputActivationStateShape, &result) &&
1035                           setInfoAndAllocateIfNeeded(&bwOutputCellState, bwOutputCellStateShape,
1036                                                      &result);
1037             }
1038             success = success && lstm.Eval();
1039         } break;
1040         case OperationType::LSTM: {
1041             RunTimeOperandInfo& scratch = operands[outs[LSTMCell::kScratchBufferTensor]];
1042             RunTimeOperandInfo& outputStateOut = operands[outs[LSTMCell::kOutputStateOutTensor]];
1043             RunTimeOperandInfo& cellStateOut = operands[outs[LSTMCell::kCellStateOutTensor]];
1044             RunTimeOperandInfo& output = operands[outs[LSTMCell::kOutputTensor]];
1045 
1046             Shape scratchShape, outputStateShape, cellStateShape, outputShape;
1047             LSTMCell lstm_cell(operation, operands);
1048 
1049             success = lstm_cell.Prepare(operation, operands, &scratchShape, &outputStateShape,
1050                                         &cellStateShape, &outputShape) &&
1051                       setInfoAndAllocateIfNeeded(&scratch, scratchShape, &result) &&
1052                       setInfoAndAllocateIfNeeded(&outputStateOut, outputStateShape, &result) &&
1053                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateShape, &result) &&
1054                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && lstm_cell.Eval();
1055         } break;
1056         case OperationType::RANDOM_MULTINOMIAL: {
1057             if (!allParametersPresent(3, 1)) {
1058                 return ANEURALNETWORKS_BAD_DATA;
1059             }
1060             RunTimeOperandInfo& output = operands[outs[Multinomial::kOutputTensor]];
1061 
1062             Shape outputShape;
1063             Multinomial multinomial(operation, operands);
1064 
1065             success = Multinomial::Prepare(operation, operands, &outputShape) &&
1066                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1067                       multinomial.Eval();
1068         } break;
1069         case OperationType::RNN: {
1070             if (!allParametersPresent(6, 2)) {
1071                 return ANEURALNETWORKS_BAD_DATA;
1072             }
1073 
1074             RunTimeOperandInfo& hiddenStateOut = operands[outs[RNN::kHiddenStateOutTensor]];
1075             RunTimeOperandInfo& output = operands[outs[RNN::kOutputTensor]];
1076 
1077             Shape hiddenStateShape, outputShape;
1078             RNN rnn_cell(operation, operands);
1079 
1080             success = RNN::Prepare(operation, operands, &hiddenStateShape, &outputShape) &&
1081                       setInfoAndAllocateIfNeeded(&hiddenStateOut, hiddenStateShape, &result) &&
1082                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && rnn_cell.Eval();
1083         } break;
1084         case OperationType::SVDF: {
1085             RunTimeOperandInfo& stateOut = operands[outs[SVDF::kStateOutTensor]];
1086             RunTimeOperandInfo& output = operands[outs[SVDF::kOutputTensor]];
1087 
1088             Shape stateShape, outputShape;
1089             SVDF svdf(operation, operands);
1090 
1091             success = SVDF::Prepare(operation, operands, &stateShape, &outputShape) &&
1092                       setInfoAndAllocateIfNeeded(&stateOut, stateShape, &result) &&
1093                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) && svdf.Eval();
1094         } break;
1095         case OperationType::BATCH_TO_SPACE_ND: {
1096             const size_t inCount = ins.size();
1097             if ((inCount != 3 && inCount != 2) || !allParametersPresent(inCount, 1)) {
1098                 return ANEURALNETWORKS_BAD_DATA;
1099             }
1100             const RunTimeOperandInfo& input = operands[ins[0]];
1101             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1102             bool data_layout = inCount == 3 ? getScalarData<bool>(operands[ins[2]]) : false;
1103 
1104             RunTimeOperandInfo& output = operands[outs[0]];
1105             Shape outShape = output.shape();
1106 
1107             RunTimeOperandInfo input_tmp, output_tmp;
1108             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1109             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1110                 success = false;
1111                 break;
1112             }
1113             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1114             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1115             output_tmp.length = data_layout ? 0 : output.length;
1116 
1117             if (!batchToSpacePrepare(input_tmp.shape(),
1118                                      reinterpret_cast<const int32_t*>(blockSize.buffer),
1119                                      blockSize.shape(), &outShape) ||
1120                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1121                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1122                 break;
1123             }
1124             switch (input_tmp.type) {
1125                 case OperandType::TENSOR_FLOAT32: {
1126                     success = batchToSpaceGeneric(
1127                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1128                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1129                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1130                     break;
1131                 }
1132                 case OperandType::TENSOR_FLOAT16: {
1133                     success = batchToSpaceGeneric(
1134                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1135                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1136                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1137                     break;
1138                 }
1139                 case OperandType::TENSOR_QUANT8_ASYMM: {
1140                     success = batchToSpaceGeneric(
1141                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1142                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1143                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1144                     break;
1145                 }
1146                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1147                     success = batchToSpaceGeneric(
1148                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1149                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1150                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1151                     break;
1152                 }
1153                 default: {
1154                     LOG(ERROR) << "Unsupported data type";
1155                     success = false;
1156                 }
1157             }
1158             if (data_layout) {
1159                 output_tmp_guard.reset(output_tmp.buffer);
1160             }
1161             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1162                 success = false;
1163                 break;
1164             }
1165         } break;
1166         case OperationType::SPACE_TO_BATCH_ND: {
1167             const size_t inCount = ins.size();
1168             if ((inCount != 4 && inCount != 3) || !allParametersPresent(inCount, 1)) {
1169                 return ANEURALNETWORKS_BAD_DATA;
1170             }
1171             const RunTimeOperandInfo& input = operands[ins[0]];
1172             const RunTimeOperandInfo& blockSize = operands[ins[1]];
1173             const RunTimeOperandInfo& paddings = operands[ins[2]];
1174             bool data_layout = inCount == 4 ? getScalarData<bool>(operands[ins[3]]) : false;
1175 
1176             RunTimeOperandInfo& output = operands[outs[0]];
1177             Shape outShape = output.shape();
1178 
1179             RunTimeOperandInfo input_tmp, output_tmp;
1180             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1181             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1182                 success = false;
1183                 break;
1184             }
1185             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1186             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1187             output_tmp.length = data_layout ? 0 : output.length;
1188 
1189             if (!spaceToBatchPrepare(
1190                         input_tmp.shape(), reinterpret_cast<const int32_t*>(blockSize.buffer),
1191                         blockSize.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1192                         paddings.shape(), &outShape) ||
1193                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1194                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1195                 break;
1196             }
1197             switch (input_tmp.type) {
1198                 case OperandType::TENSOR_FLOAT32: {
1199                     success = spaceToBatchGeneric(
1200                             reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1201                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1202                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1203                             reinterpret_cast<float*>(output_tmp.buffer), outShape);
1204                     break;
1205                 }
1206                 case OperandType::TENSOR_FLOAT16: {
1207                     success = spaceToBatchGeneric(
1208                             reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1209                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1210                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1211                             reinterpret_cast<_Float16*>(output_tmp.buffer), outShape);
1212                     break;
1213                 }
1214                 case OperandType::TENSOR_QUANT8_ASYMM: {
1215                     success = spaceToBatchGeneric(
1216                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1217                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1218                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1219                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1220                     break;
1221                 }
1222                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1223                     success = spaceToBatchGeneric(
1224                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1225                             reinterpret_cast<const int32_t*>(blockSize.buffer),
1226                             reinterpret_cast<const int32_t*>(paddings.buffer), paddings.shape(),
1227                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1228                     break;
1229                 }
1230                 default: {
1231                     LOG(ERROR) << "Unsupported data type";
1232                     success = false;
1233                 }
1234             }
1235             if (data_layout) {
1236                 output_tmp_guard.reset(output_tmp.buffer);
1237             }
1238             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1239                 success = false;
1240                 break;
1241             }
1242         } break;
1243         case OperationType::PAD:
1244         case OperationType::PAD_V2: {
1245             const bool isV2 = operation.type == OperationType::PAD_V2;
1246             if (!allParametersPresent(isV2 ? 3 : 2, 1)) {
1247                 return ANEURALNETWORKS_BAD_DATA;
1248             }
1249             const RunTimeOperandInfo& input = operands[ins[0]];
1250             const RunTimeOperandInfo& paddings = operands[ins[1]];
1251 
1252             RunTimeOperandInfo& output = operands[outs[0]];
1253             Shape outShape = output.shape();
1254 
1255             if (!padPrepare(input.shape(), reinterpret_cast<const int32_t*>(paddings.buffer),
1256                             paddings.shape(), &outShape) ||
1257                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1258                 break;
1259             }
1260             if (input.type == OperandType::TENSOR_FLOAT32) {
1261                 float pad_value = isV2 ? getScalarData<float>(operands[ins[2]]) : 0;
1262                 success = padGeneric(reinterpret_cast<const float*>(input.buffer), input.shape(),
1263                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1264                                      reinterpret_cast<float*>(output.buffer), outShape);
1265             } else if (input.type == OperandType::TENSOR_FLOAT16) {
1266                 _Float16 pad_value = isV2 ? getScalarData<_Float16>(operands[ins[2]]) : 0;
1267                 success = padGeneric(reinterpret_cast<const _Float16*>(input.buffer), input.shape(),
1268                                      reinterpret_cast<const int32_t*>(paddings.buffer),
1269                                      static_cast<_Float16>(pad_value),
1270                                      reinterpret_cast<_Float16*>(output.buffer), outShape);
1271             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1272                 uint8_t pad_value =
1273                         isV2 ? getScalarData<uint8_t>(operands[ins[2]]) : outShape.offset;
1274                 success = padGeneric(input.buffer, input.shape(),
1275                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1276                                      output.buffer, outShape);
1277             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1278                 uint8_t pad_value =
1279                         isV2 ? getScalarData<int8_t>(operands[ins[2]]) : outShape.offset;
1280                 success = padGeneric(input.buffer, input.shape(),
1281                                      reinterpret_cast<const int32_t*>(paddings.buffer), pad_value,
1282                                      output.buffer, outShape);
1283             }
1284         } break;
1285         case OperationType::CAST: {
1286             if (!allParametersPresent(1, 1)) {
1287                 return ANEURALNETWORKS_BAD_DATA;
1288             }
1289             const RunTimeOperandInfo& input = operands[ins[0]];
1290 
1291             RunTimeOperandInfo& output = operands[outs[0]];
1292             Shape outShape = output.shape();
1293 
1294             success = cast::prepare(input.shape(), &outShape) &&
1295                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1296                       cast::eval(input.buffer, input.shape(), output.buffer, outShape);
1297         } break;
1298         case OperationType::MEAN: {
1299             if (!allParametersPresent(3, 1)) {
1300                 return ANEURALNETWORKS_BAD_DATA;
1301             }
1302             const RunTimeOperandInfo& input = operands[ins[0]];
1303             const RunTimeOperandInfo& axis = operands[ins[1]];
1304             int32_t keepDims = getScalarData<int32_t>(operands[ins[2]]);
1305 
1306             RunTimeOperandInfo& output = operands[outs[0]];
1307             Shape outShape = output.shape();
1308 
1309             if (!meanPrepare(input.shape(), reinterpret_cast<const int32_t*>(axis.buffer),
1310                              axis.shape(), keepDims > 0, &outShape) ||
1311                 !setInfoAndAllocateIfNeeded(&output, outShape, &result)) {
1312                 break;
1313             }
1314             if (input.type == OperandType::TENSOR_FLOAT16) {
1315                 success = meanFloat16(reinterpret_cast<_Float16*>(input.buffer), input.shape(),
1316                                       reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(),
1317                                       keepDims > 0, reinterpret_cast<_Float16*>(output.buffer),
1318                                       outShape);
1319             } else if (input.type == OperandType::TENSOR_FLOAT32) {
1320                 success = meanGeneric<float, float>(
1321                         reinterpret_cast<float*>(input.buffer), input.shape(),
1322                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1323                         reinterpret_cast<float*>(output.buffer), outShape);
1324             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM) {
1325                 success = meanGeneric<uint8_t, int32_t>(
1326                         reinterpret_cast<uint8_t*>(input.buffer), input.shape(),
1327                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1328                         reinterpret_cast<uint8_t*>(output.buffer), outShape);
1329             } else if (input.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1330                 success = meanGeneric<int8_t, int32_t>(
1331                         reinterpret_cast<int8_t*>(input.buffer), input.shape(),
1332                         reinterpret_cast<const int32_t*>(axis.buffer), axis.shape(), keepDims > 0,
1333                         reinterpret_cast<int8_t*>(output.buffer), outShape);
1334             }
1335         } break;
1336         case OperationType::ARGMAX:
1337         case OperationType::ARGMIN: {
1338             if (!allParametersPresent(2, 1)) {
1339                 return ANEURALNETWORKS_BAD_DATA;
1340             }
1341             const RunTimeOperandInfo& input = operands[ins[0]];
1342             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1343 
1344             RunTimeOperandInfo& output = operands[outs[0]];
1345             Shape outShape = output.shape();
1346 
1347             const bool isArgMin = operation.type == OperationType::ARGMIN;
1348             success = argMinMaxPrepare(input.shape(), axis, &outShape) &&
1349                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1350                       argMinMaxGeneric(input.buffer, input.shape(), axis, isArgMin, output.buffer,
1351                                        outShape);
1352         } break;
1353         case OperationType::EXPAND_DIMS: {
1354             if (!allParametersPresent(2, 1)) {
1355                 return ANEURALNETWORKS_BAD_DATA;
1356             }
1357             const RunTimeOperandInfo& input = operands[ins[0]];
1358             int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1359 
1360             RunTimeOperandInfo& output = operands[outs[0]];
1361             Shape outShape = output.shape();
1362 
1363             success = expand_dims::prepare(input.shape(), axis, &outShape) &&
1364                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1365                       expand_dims::eval(input.buffer, input.shape(), axis, output.buffer, outShape);
1366         } break;
1367         case OperationType::SPLIT: {
1368             const size_t outCount = outs.size();
1369             if (!allParametersPresent(3, outCount)) {
1370                 return ANEURALNETWORKS_BAD_DATA;
1371             }
1372 
1373             const RunTimeOperandInfo& input = operands[ins[0]];
1374             const int32_t axis = getScalarData<int32_t>(operands[ins[1]]);
1375             const int32_t numOutputs = getScalarData<int32_t>(operands[ins[2]]);
1376 
1377             if (numOutputs != outs.size()) {
1378                 return ANEURALNETWORKS_BAD_DATA;
1379             }
1380 
1381             std::vector<Shape> outputShapes(numOutputs);
1382             for (int i = 0; i < numOutputs; ++i) {
1383                 outputShapes[i] = operands[outs[i]].shape();
1384             }
1385 
1386             success = splitPrepare(input.shape(), axis, numOutputs, &outputShapes);
1387             for (int i = 0; i < numOutputs; ++i) {
1388                 success = success && setInfoAndAllocateIfNeeded(&(operands[outs[i]]),
1389                                                                 outputShapes[i], &result);
1390             }
1391             switch (input.type) {
1392                 case OperandType::TENSOR_FLOAT16: {
1393                     std::vector<_Float16*> outputDataPtrs(numOutputs);
1394                     for (int i = 0; i < numOutputs; ++i) {
1395                         outputDataPtrs[i] = reinterpret_cast<_Float16*>(operands[outs[i]].buffer);
1396                     }
1397                     success = success &&
1398                               splitFloat16(reinterpret_cast<const _Float16*>(input.buffer),
1399                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1400                 } break;
1401                 case OperandType::TENSOR_FLOAT32: {
1402                     std::vector<float*> outputDataPtrs(numOutputs);
1403                     for (int i = 0; i < numOutputs; ++i) {
1404                         outputDataPtrs[i] = reinterpret_cast<float*>(operands[outs[i]].buffer);
1405                     }
1406                     success = success &&
1407                               splitFloat32(reinterpret_cast<const float*>(input.buffer),
1408                                            input.shape(), axis, &outputDataPtrs, outputShapes);
1409                 } break;
1410                 case OperandType::TENSOR_INT32: {
1411                     std::vector<int32_t*> outputDataPtrs(numOutputs);
1412                     for (int i = 0; i < numOutputs; ++i) {
1413                         outputDataPtrs[i] = reinterpret_cast<int32_t*>(operands[outs[i]].buffer);
1414                     }
1415                     success = success &&
1416                               splitInt32(reinterpret_cast<const int32_t*>(input.buffer),
1417                                          input.shape(), axis, &outputDataPtrs, outputShapes);
1418                 } break;
1419                 case OperandType::TENSOR_QUANT8_ASYMM: {
1420                     std::vector<uint8_t*> outputDataPtrs(numOutputs);
1421                     for (int i = 0; i < numOutputs; ++i) {
1422                         outputDataPtrs[i] = reinterpret_cast<uint8_t*>(operands[outs[i]].buffer);
1423                     }
1424                     success = success &&
1425                               splitQuant8(reinterpret_cast<const uint8_t*>(input.buffer),
1426                                           input.shape(), axis, &outputDataPtrs, outputShapes);
1427                 } break;
1428                 case OperandType::TENSOR_QUANT8_ASYMM_SIGNED: {
1429                     std::vector<int8_t*> outputDataPtrs(numOutputs);
1430                     for (int i = 0; i < numOutputs; ++i) {
1431                         outputDataPtrs[i] = reinterpret_cast<int8_t*>(operands[outs[i]].buffer);
1432                     }
1433                     success = success &&
1434                               splitQuant8Signed(reinterpret_cast<const int8_t*>(input.buffer),
1435                                                 input.shape(), axis, &outputDataPtrs, outputShapes);
1436                 } break;
1437                 default: {
1438                     return ANEURALNETWORKS_BAD_DATA;
1439                 }
1440             }
1441         } break;
1442         case OperationType::MAXIMUM:
1443         case OperationType::MINIMUM: {
1444             if (!allParametersPresent(2, 1)) {
1445                 return ANEURALNETWORKS_BAD_DATA;
1446             }
1447             const RunTimeOperandInfo& in1 = operands[ins[0]];
1448             const RunTimeOperandInfo& in2 = operands[ins[1]];
1449 
1450             RunTimeOperandInfo& output = operands[outs[0]];
1451             Shape outputShape = output.shape();
1452 
1453             const bool isMinimum = operation.type == OperationType::MINIMUM;
1454             success = maximum_minimum::prepare(in1.shape(), in2.shape(), &outputShape) &&
1455                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1456                       maximum_minimum::eval(in1.buffer, in1.shape(), in2.buffer, in2.shape(),
1457                                             isMinimum, output.buffer, outputShape);
1458         } break;
1459         case OperationType::GROUPED_CONV_2D: {
1460             const size_t inCount = ins.size();
1461             if ((inCount != 12 && inCount != 9) || !allParametersPresent(inCount, 1)) {
1462                 return ANEURALNETWORKS_BAD_DATA;
1463             }
1464             const RunTimeOperandInfo& input = operands[ins[0]];
1465             const RunTimeOperandInfo& filter = operands[ins[1]];
1466             const RunTimeOperandInfo& bias = operands[ins[2]];
1467 
1468             int32_t padding_left, padding_right;
1469             int32_t padding_top, padding_bottom;
1470             int32_t padding_implicit = 0;
1471             int32_t stride_width, stride_height;
1472             int32_t numGroups;
1473             int32_t activation;
1474             bool data_layout = false;
1475 
1476             if (inCount == 12) {
1477                 padding_left = getScalarData<int32_t>(operands[ins[3]]);
1478                 padding_right = getScalarData<int32_t>(operands[ins[4]]);
1479                 padding_top = getScalarData<int32_t>(operands[ins[5]]);
1480                 padding_bottom = getScalarData<int32_t>(operands[ins[6]]);
1481                 stride_width = getScalarData<int32_t>(operands[ins[7]]);
1482                 stride_height = getScalarData<int32_t>(operands[ins[8]]);
1483                 numGroups = getScalarData<int32_t>(operands[ins[9]]);
1484                 activation = getScalarData<int32_t>(operands[ins[10]]);
1485                 data_layout = getScalarData<bool>(operands[ins[11]]);
1486             } else {
1487                 padding_implicit = getScalarData<int32_t>(operands[ins[3]]);
1488                 stride_width = getScalarData<int32_t>(operands[ins[4]]);
1489                 stride_height = getScalarData<int32_t>(operands[ins[5]]);
1490                 numGroups = getScalarData<int32_t>(operands[ins[6]]);
1491                 activation = getScalarData<int32_t>(operands[ins[7]]);
1492                 data_layout = getScalarData<bool>(operands[ins[8]]);
1493             }
1494 
1495             RunTimeOperandInfo& output = operands[outs[0]];
1496             Shape outShape = output.shape();
1497 
1498             RunTimeOperandInfo input_tmp, output_tmp;
1499             std::unique_ptr<uint8_t[]> input_tmp_guard, output_tmp_guard;
1500             if (!convertToNhwc(input_tmp, input, input_tmp_guard, data_layout)) {
1501                 success = false;
1502                 break;
1503             }
1504             output_tmp.lifetime = Operand::LifeTime::TEMPORARY_VARIABLE;
1505             output_tmp.buffer = data_layout ? nullptr : output.buffer;
1506             output_tmp.length = data_layout ? 0 : output.length;
1507 
1508             if (inCount == 9) {
1509                 Shape inputShape = input_tmp.shape();
1510                 Shape filterShape = filter.shape();
1511                 int32_t input_width = getSizeOfDimension(inputShape, 2);
1512                 int32_t input_height = getSizeOfDimension(inputShape, 1);
1513                 int32_t filter_width = getSizeOfDimension(filterShape, 2);
1514                 int32_t filter_height = getSizeOfDimension(filterShape, 1);
1515                 calculateExplicitPadding(input_width, stride_width, filter_width, padding_implicit,
1516                                          &padding_left, &padding_right);
1517                 calculateExplicitPadding(input_height, stride_height, filter_height,
1518                                          padding_implicit, &padding_top, &padding_bottom);
1519             }
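            // With the implicit-padding signature (9 inputs), the explicit paddings above are
            // derived from the NHWC input size, filter size, stride, and padding scheme before
            // preparing the grouped convolution.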
1520 
1521             if (!groupedConvPrepare(input_tmp.shape(), filter.shape(), bias.shape(), padding_left,
1522                                     padding_right, padding_top, padding_bottom, stride_width,
1523                                     stride_height, numGroups, &outShape) ||
1524                 !setInfoAndAllocateIfNeeded(&output_tmp, outShape, &result)) {
1525                 if (!data_layout) output.dimensions = output_tmp.dimensions;
1526                 success = false;
1527                 break;
1528             }
1529 
1530             if (input_tmp.type == OperandType::TENSOR_FLOAT32) {
1531                 success = groupedConvFloat32(
1532                         reinterpret_cast<const float*>(input_tmp.buffer), input_tmp.shape(),
1533                         reinterpret_cast<const float*>(filter.buffer), filter.shape(),
1534                         reinterpret_cast<const float*>(bias.buffer), bias.shape(), padding_left,
1535                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1536                         numGroups, activation, reinterpret_cast<float*>(output_tmp.buffer),
1537                         outShape);
1538             } else if (input_tmp.type == OperandType::TENSOR_FLOAT16) {
1539                 success = groupedConvFloat16(
1540                         reinterpret_cast<const _Float16*>(input_tmp.buffer), input_tmp.shape(),
1541                         reinterpret_cast<const _Float16*>(filter.buffer), filter.shape(),
1542                         reinterpret_cast<const _Float16*>(bias.buffer), bias.shape(), padding_left,
1543                         padding_right, padding_top, padding_bottom, stride_width, stride_height,
1544                         numGroups, activation, reinterpret_cast<_Float16*>(output_tmp.buffer),
1545                         outShape);
1546             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM) {
1547                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1548                     success = groupedConvQuant8PerChannel(
1549                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1550                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1551                             std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1552                                     .scales.data(),
1553                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1554                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1555                             stride_height, numGroups, activation,
1556                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1557                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM) {
1558                     success = groupedConvQuant8(
1559                             reinterpret_cast<const uint8_t*>(input_tmp.buffer), input_tmp.shape(),
1560                             reinterpret_cast<const uint8_t*>(filter.buffer), filter.shape(),
1561                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1562                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1563                             stride_height, numGroups, activation,
1564                             reinterpret_cast<uint8_t*>(output_tmp.buffer), outShape);
1565                 }
1566             } else if (input_tmp.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1567                 if (filter.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
1568                     success = groupedConvQuant8PerChannel(
1569                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1570                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1571                             std::get<Operand::SymmPerChannelQuantParams>(filter.extraParams)
1572                                     .scales.data(),
1573                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1574                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1575                             stride_height, numGroups, activation,
1576                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1577                 } else if (filter.type == OperandType::TENSOR_QUANT8_ASYMM_SIGNED) {
1578                     success = groupedConvQuant8(
1579                             reinterpret_cast<const int8_t*>(input_tmp.buffer), input_tmp.shape(),
1580                             reinterpret_cast<const int8_t*>(filter.buffer), filter.shape(),
1581                             reinterpret_cast<const int32_t*>(bias.buffer), bias.shape(),
1582                             padding_left, padding_right, padding_top, padding_bottom, stride_width,
1583                             stride_height, numGroups, activation,
1584                             reinterpret_cast<int8_t*>(output_tmp.buffer), outShape);
1585                 }
1586             }
1587 
1588             if (data_layout) {
1589                 output_tmp_guard.reset(output_tmp.buffer);
1590             }
1591             if (!success || !convertFromNhwc(output, output_tmp, data_layout, &result)) {
1592                 success = false;
1593                 break;
1594             }
1595         } break;
1596         case OperationType::TILE: {
1597             if (!allParametersPresent(2, 1)) {
1598                 return ANEURALNETWORKS_BAD_DATA;
1599             }
1600             const RunTimeOperandInfo& input = operands[ins[0]];
1601             const RunTimeOperandInfo& multiples = operands[ins[1]];
1602 
1603             RunTimeOperandInfo& output = operands[outs[0]];
1604             Shape outShape = output.shape();
1605 
1606             success =
1607                     tile::prepare(input.shape(), reinterpret_cast<const int32_t*>(multiples.buffer),
1608                                   multiples.shape(), &outShape) &&
1609                     setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1610                     tile::eval(input.buffer, input.shape(),
1611                                reinterpret_cast<const int32_t*>(multiples.buffer), output.buffer,
1612                                outShape);
1613         } break;
1614         case OperationType::QUANTIZED_16BIT_LSTM: {
1615             if (!allParametersPresent(15, 2)) {
1616                 return ANEURALNETWORKS_BAD_DATA;
1617             }
1618 
1619             RunTimeOperandInfo& cellStateOut =
1620                     operands[outs[QuantizedLSTMCell::kCellStateOutTensor]];
1621             RunTimeOperandInfo& output = operands[outs[QuantizedLSTMCell::kOutputTensor]];
1622 
1623             Shape cellStateOutShape, outputShape;
1624             QuantizedLSTMCell quantizedLSTMCell(operation, operands);
1625 
1626             success = QuantizedLSTMCell::prepare(operation, operands, &cellStateOutShape,
1627                                                  &outputShape) &&
1628                       setInfoAndAllocateIfNeeded(&cellStateOut, cellStateOutShape, &result) &&
1629                       setInfoAndAllocateIfNeeded(&output, outputShape, &result) &&
1630                       quantizedLSTMCell.eval();
1631         } break;
1632         case OperationType::POW: {
1633             if (!allParametersPresent(2, 1)) {
1634                 return ANEURALNETWORKS_BAD_DATA;
1635             }
1636             const RunTimeOperandInfo& base = operands[ins[0]];
1637             const RunTimeOperandInfo& exponent = operands[ins[1]];
1638 
1639             RunTimeOperandInfo& output = operands[outs[0]];
1640             Shape outShape = output.shape();
1641 
1642             success = pow::prepare(base.shape(), exponent.shape(), &outShape) &&
1643                       setInfoAndAllocateIfNeeded(&output, outShape, &result) &&
1644                       pow::eval(base.buffer, base.shape(), exponent.buffer, exponent.shape(),
1645                                 output.buffer, outShape);
1646         } break;
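        // Any remaining operation type is dispatched through the operation resolver, which
        // supplies the registered prepare() and execute() entry points for that operation.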
1647         default: {
1648             const OperationRegistration* operationRegistration =
1649                     mOperationResolver->findOperation(operation.type);
1650             if (operationRegistration == nullptr) {
1651                 LOG(ERROR) << operation.type << " not registered";
1652             } else if (operationRegistration->prepare == nullptr ||
1653                        operationRegistration->execute == nullptr) {
1654                 LOG(ERROR) << "Incomplete operation registration: " << operation.type;
1655             } else {
1656                 OperationExecutionContext context(&operation, operands);
1657                 success = operationRegistration->flags.allowOmittedOperand ||
1658                           context.checkNoOmittedOperand();
1659                 success = success && (operationRegistration->flags.allowZeroSizedInput ||
1660                                       context.checkNoZeroSizedInput());
1661                 success = success && operationRegistration->prepare(&context) &&
1662                           operationRegistration->execute(&context);
1663                 result = context.getResultCode();
1664             }
1665         }
1666     }
1667     if (!success && result == ANEURALNETWORKS_NO_ERROR) {
1668         result = ANEURALNETWORKS_OP_FAILED;
1669     }
1670     if (result != ANEURALNETWORKS_NO_ERROR) {
1671         LOG(ERROR) << operation.type << " failed.";
1672     }
1673 
1674     consumeOperationInputs(ins, operands);
1675     return result;
1676 #else
1677     LOG(ERROR) << "Built without CPU execution support";
1678     return ANEURALNETWORKS_OP_FAILED;
1679 #endif  // NN_INCLUDE_CPU_IMPLEMENTATION
1680 }
1681 
1682 // Copies RunTimeOperandInfo, preserving the original lifetime and numberOfUsesLeft
1683 // to prevent deallocation of subgraph inputs and outputs.
1684 static void setInfoExceptLifetime(RunTimeOperandInfo* to, const RunTimeOperandInfo& from) {
1685     auto originalLifetime = to->lifetime;
1686     auto originalNumberOfUsesLeft = to->numberOfUsesLeft;
1687     *to = from;
1688     to->lifetime = originalLifetime;
1689     to->numberOfUsesLeft = originalNumberOfUsesLeft;
1690 }
1691 
1692 int CpuExecutor::executeIfOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1693     namespace op = operation_if;
1694     const RunTimeOperandInfo& condOperand = operands[operation.inputs[op::kCondBoolOperand]];
1695     if (condOperand.buffer == nullptr) {
1696         LOG(ERROR) << "Cannot read IF condition operand value";
1697         return ANEURALNETWORKS_OP_FAILED;
1698     }
1699     const bool condValue = *reinterpret_cast<const bool8*>(condOperand.buffer);
1700     VLOG(CPUEXE) << "CpuExecutor::executeIfOperation: condition value: " << condValue;
1701 
1702     const uint32_t branchInputIndex = condValue ? op::kThenModelOperand : op::kElseModelOperand;
1703     const RunTimeOperandInfo& branchOperand = operands[operation.inputs[branchInputIndex]];
1704     const Model::Subgraph& branchSubgraph =
1705             *reinterpret_cast<const Model::Subgraph*>(branchOperand.buffer);
1706     std::vector<RunTimeOperandInfo> branchOperands = initializeRunTimeInfo(branchSubgraph);
1707 
1708     // Initialize inner input and output operands from outer operands.
1709     for (uint32_t i = 0, n = branchSubgraph.inputIndexes.size(); i < n; ++i) {
1710         setInfoExceptLifetime(&branchOperands[branchSubgraph.inputIndexes[i]],
1711                               operands[operation.inputs[op::kFirstInput + i]]);
1712     }
1713     for (uint32_t i = 0, n = branchSubgraph.outputIndexes.size(); i < n; ++i) {
1714         setInfoExceptLifetime(&branchOperands[branchSubgraph.outputIndexes[i]],
1715                               operands[operation.outputs[i]]);
1716     }
1717 
1718     NN_RETURN_IF_ERROR(executeSubgraph(branchSubgraph, branchOperands.data()));
1719     freeUnusedSubgraphOperands(&branchOperands);
1720 
1721     // Update outer outputs.
1722     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1723         setInfoExceptLifetime(&operands[operation.outputs[i]],
1724                               branchOperands[branchSubgraph.outputIndexes[i]]);
1725     }
1726 
1727     consumeOperationInputs(operation.inputs, operands);
1728     return ANEURALNETWORKS_NO_ERROR;
1729 }
1730 
1731 int CpuExecutor::executeWhileOperation(const Operation& operation, RunTimeOperandInfo* operands) {
1732     namespace op = operation_while;
1733     const RunTimeOperandInfo& condModelOperand = operands[operation.inputs[op::kCondModelOperand]];
1734     const RunTimeOperandInfo& bodyModelOperand = operands[operation.inputs[op::kBodyModelOperand]];
1735     const Model::Subgraph& condSubgraph =
1736             *reinterpret_cast<const Model::Subgraph*>(condModelOperand.buffer);
1737     const Model::Subgraph& bodySubgraph =
1738             *reinterpret_cast<const Model::Subgraph*>(bodyModelOperand.buffer);
1739     std::vector<RunTimeOperandInfo> condOperands = initializeRunTimeInfo(condSubgraph);
1740     std::vector<RunTimeOperandInfo> bodyOperands = initializeRunTimeInfo(bodySubgraph);
1741 
1742     // The code below implements the following sequence of subgraph input and output buffer
1743     // assignments:
1744     // iteration = 0   cond inputs = body inputs = outer inputs   body outputs = tmp1
1745     // iteration = 1   cond inputs = body inputs = tmp1           body outputs = tmp2
1746     // iteration = 2   cond inputs = body inputs = tmp2           body outputs = tmp1
1747     // iteration = 3   cond inputs = body inputs = ...            body outputs = ...
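    // In other words, tmp1 and tmp2 act as ping-pong buffers: each iteration reads the
    // previous iteration's body outputs and writes its own outputs into the other buffer.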
1748 
1749     // For body output double buffering.
1750     std::vector<uint8_t*> tmp1(bodySubgraph.outputIndexes.size());
1751     std::vector<uint8_t*> tmp2(bodySubgraph.outputIndexes.size());
1752 
1753     // Ensure temporary buffers and subgraph operands are freed, and operation inputs are consumed, on all exit paths.
1754     auto cleanupGuard = base::make_scope_guard(
1755             [&tmp1, &tmp2, &condOperands, &bodyOperands, &operation, &operands] {
1756                 auto freeLoopOutputs = [](const std::vector<uint8_t*>& tmp) {
1757                     for (auto buffer : tmp) {
1758                         if (buffer != nullptr) {
1759                             delete[] buffer;
1760                         }
1761                     }
1762                 };
1763 
1764                 freeLoopOutputs(tmp1);
1765                 freeLoopOutputs(tmp2);
1766                 freeUnusedSubgraphOperands(&condOperands);
1767                 freeUnusedSubgraphOperands(&bodyOperands);
1768                 consumeOperationInputs(operation.inputs, operands);
1769             });
1770 
1771     // For body outputs with unknown shape, we skip double buffering and
1772     // allocate on each iteration instead. This allows growing output tensors
1773     // inside a WHILE loop.
1774     std::vector<bool> bodyOutputHasUnknownShape(bodySubgraph.outputIndexes.size());
1775     for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1776         const Operand& operand = bodySubgraph.operands[bodySubgraph.outputIndexes[i]];
1777         bodyOutputHasUnknownShape[i] = nonExtensionOperandSizeOfData(operand) == 0;
1778     }
1779 
1780     // Initialize condition inputs from outer operands.
1781     for (uint32_t i = 0, n = condSubgraph.inputIndexes.size(); i < n; ++i) {
1782         setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1783                               operands[operation.inputs[op::kFirstInput + i]]);
1784     }
1785 
1786     // Store condition output on the stack.
1787     RunTimeOperandInfo& condOutput = condOperands[condSubgraph.outputIndexes[0]];
1788     bool8 condValue = {/* initialized memory */};
1789     condOutput.buffer = &condValue;
1790     condOutput.length = sizeof(condValue);
1791 
1792     std::chrono::nanoseconds timeoutDuration(mLoopTimeoutDuration);
1793     const auto startTime = Clock::now();
1794     for (uint32_t iteration = 0;; ++iteration) {
1795         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: iteration " << iteration;
1796         if (iteration != 0) {
1797             // Set condition inputs from previous iteration outputs.
1798             for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1799                 setInfoExceptLifetime(&condOperands[condSubgraph.inputIndexes[i]],
1800                                       bodyOperands[bodySubgraph.outputIndexes[i]]);
1801             }
1802         }
1803         NN_RETURN_IF_ERROR(executeSubgraph(condSubgraph, condOperands.data()));
1804         VLOG(CPUEXE) << "CpuExecutor::executeWhileOperation: condition value: "
1805                      << static_cast<int>(condValue);
1806         if (!condValue) {
1807             break;
1808         }
1809 
1810         const auto duration = Clock::now() - startTime;
1811         if (duration > timeoutDuration) {
1812             LOG(ERROR) << "CpuExecutor::executeWhileOperation: timed out after "
1813                        << std::chrono::duration_cast<std::chrono::milliseconds>(duration).count()
1814                        << " ms";
1815             return ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT;
1816         }
1817 
1818         // Set body inputs from condition inputs.
1819         for (uint32_t i = 0, n = bodySubgraph.inputIndexes.size(); i < n; ++i) {
1820             bodyOperands[bodySubgraph.inputIndexes[i]] = condOperands[condSubgraph.inputIndexes[i]];
1821         }
1822         // Set body outputs.
1823         auto& outputBuffer = iteration % 2 == 0 ? tmp1 : tmp2;
1824         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1825             RunTimeOperandInfo& info = bodyOperands[bodySubgraph.outputIndexes[i]];
1826             if (bodyOutputHasUnknownShape[i]) {
1827                 // Reset dimensions and buffer.
1828                 info.dimensions = bodySubgraph.operands[bodySubgraph.outputIndexes[i]].dimensions;
1829                 if (outputBuffer[i] != nullptr) {
1830                     delete[] outputBuffer[i];
1831                     outputBuffer[i] = nullptr;
1832                 }
1833             }
1834             info.buffer = outputBuffer[i];
1835         }
1836 
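        // Run one iteration of the loop body; outputs are written into the ping-pong buffer
        // selected above, or into freshly allocated buffers for outputs of unknown shape.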
1837         NN_RETURN_IF_ERROR(executeSubgraph(bodySubgraph, bodyOperands.data()));
1838 
1839         // Update output buffer information in case we have allocated new buffers.
1840         for (uint32_t i = 0, n = bodySubgraph.outputIndexes.size(); i < n; ++i) {
1841             outputBuffer[i] = bodyOperands[bodySubgraph.outputIndexes[i]].buffer;
1842         }
1843     }
1844 
1845     // Copy body outputs to outer outputs.
1846     for (uint32_t i = 0, n = operation.outputs.size(); i < n; ++i) {
1847         RunTimeOperandInfo& outerOperand = operands[operation.outputs[i]];
1848         RunTimeOperandInfo& innerOperand = condOperands[condSubgraph.inputIndexes[i]];
1849         if (int error; !setInfoAndAllocateIfNeeded(&outerOperand, innerOperand.shape(), &error)) {
1850             return error;
1851         }
1852         CHECK_EQ(outerOperand.length, innerOperand.length);
1853         // TODO: Use the outer buffer as tmp1 to avoid copies.
1854         std::memcpy(outerOperand.buffer, innerOperand.buffer, innerOperand.length);
1855     }
1856 
1857     return ANEURALNETWORKS_NO_ERROR;
1858 }
1859 
1860 void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
1861                                   const std::vector<RunTimeOperandInfo>& operands) {
1862     mOutputShapes.resize(outputIndexes.size());
1863     for (uint32_t i = 0; i < outputIndexes.size(); i++) {
1864         const uint32_t operandIndex = outputIndexes[i];
1865         const RunTimeOperandInfo& from = operands[operandIndex];
1866         mOutputShapes[i].dimensions = from.dimensions;
1867         mOutputShapes[i].isSufficient = from.isSufficient();
1868         VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
1869                         << "] = " << mOutputShapes[i];
1870     }
1871 }
1872 
1873 // b/109953668, disable OpenMP
1874 #ifdef NNAPI_OPENMP
1875 ScopedOpenmpSettings::ScopedOpenmpSettings() {
1876     mBlocktimeInitial = kmp_get_blocktime();
1877     kmp_set_blocktime(20);  // ms, see b/109645291
1878 
1879 #if NNAPI_LIMIT_CPU_THREADS
1880     // Code not yet enabled. The number of threads will be chosen based on
1881     // benchmarking. See the longer comment by the class declaration.
1882     mMaxThreadsInitial = Eigen::nbThreads();
1883     const int nProcs = omp_get_num_procs();
1884     int threads = nProcs;
1885     if (nProcs >= 8) {
1886         threads = nProcs - 4;
1887     } else if (nProcs >= 4) {
1888         threads = nProcs - 2;
1889     }
1890     Eigen::setNbThreads(threads);
1891 #endif
1892 }
1893 
1894 ScopedOpenmpSettings::~ScopedOpenmpSettings() {
1895     kmp_set_blocktime(mBlocktimeInitial);
1896 #if NNAPI_LIMIT_CPU_THREADS
1897     Eigen::setNbThreads(mMaxThreadsInitial);
1898 #endif
1899 }
1900 #endif  // NNAPI_OPENMP
1901 
1902 }  // namespace nn
1903 }  // namespace android
1904