/**
 * Copyright 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "run_tflite.h"

#include <android/log.h>
#include <dirent.h>
#include <dlfcn.h>
#include <ftw.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <unistd.h>

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <memory>

#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
#include "tensorflow/lite/kernels/register.h"

#define LOG_TAG "NN_BENCHMARK"

#define FATAL(fmt, ...)                                                  \
  do {                                                                   \
    __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
    assert(false);                                                       \
  } while (0)

namespace {

long long currentTimeInUsec() {
  timeval tv;
  gettimeofday(&tv, NULL);
  return ((tv.tv_sec * 1000000L) + tv.tv_usec);
}

// Workaround for build systems that make it difficult to pick the correct NDK
// API level. NDK tracing methods are dynamically loaded from libandroid.so.
typedef void* (*fp_ATrace_beginSection)(const char* sectionName);
typedef void* (*fp_ATrace_endSection)();
struct TraceFunc {
  fp_ATrace_beginSection ATrace_beginSection;
  fp_ATrace_endSection ATrace_endSection;
};
TraceFunc setupTraceFunc() {
  void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == nullptr) {
    FATAL("unable to open libandroid.so");
  }
  return {
      reinterpret_cast<fp_ATrace_beginSection>(
          dlsym(lib, "ATrace_beginSection")),
      reinterpret_cast<fp_ATrace_endSection>(dlsym(lib, "ATrace_endSection"))};
}
static TraceFunc kTraceFunc{setupTraceFunc()};

}  // namespace

BenchmarkModel* BenchmarkModel::create(const char* modelfile, int tfliteBackend,
                                       bool enable_intermediate_tensors_dump,
                                       int* nnapiErrno,
                                       const char* nnapi_device_name,
                                       bool mmapModel,
                                       const char* nnapi_cache_dir) {
  BenchmarkModel* model = new BenchmarkModel();
  if (!model->init(modelfile, tfliteBackend, enable_intermediate_tensors_dump,
                   nnapiErrno, nnapi_device_name, mmapModel,
                   nnapi_cache_dir)) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to init model %s",
                        modelfile);
    delete model;
    return nullptr;
  }
  return model;
}

bool BenchmarkModel::init(const char* modelfile, int tfliteBackend,
                          bool enable_intermediate_tensors_dump,
                          int* nnapiErrno, const char* nnapi_device_name,
                          bool mmapModel, const char* nnapi_cache_dir) {
  __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "BenchmarkModel %s",
                      modelfile);
  mModelFile = modelfile;
  if (nnapi_cache_dir) {
    mCacheDir = nnapi_cache_dir;
  }
  if (nnapi_device_name) {
    mNnApiDeviceName = nnapi_device_name;
  }

  if (mmapModel) {
    // Memory map the model. NOTE this needs lifetime greater than or equal
    // to interpreter context.
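    // The same lifetime requirement applies to mModelBuffer in the else
    // branch below: BuildFromBuffer() does not copy the data it is given.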
    mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
  } else {
    std::ifstream t(modelfile);
    mModelBuffer = std::string((std::istreambuf_iterator<char>(t)),
                               std::istreambuf_iterator<char>());
    mTfliteModel = tflite::FlatBufferModel::BuildFromBuffer(
        mModelBuffer.c_str(), mModelBuffer.size());
  }
  if (!mTfliteModel) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
                        modelfile);
    return false;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
  if (!mTfliteInterpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to create TFlite interpreter");
    return false;
  }

  if (enable_intermediate_tensors_dump) {
    // Make output of every op a model output. This way we will be able to
    // fetch each intermediate tensor when running with delegates.
    outputs.clear();
    for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
      auto node_outputs =
          mTfliteInterpreter->node_and_registration(node)->first.outputs;
      outputs.insert(outputs.end(), node_outputs->data,
                     node_outputs->data + node_outputs->size);
    }
    mTfliteInterpreter->SetOutputs(outputs);
  }

  // Allow Fp16 precision for all models
  mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

  mTfliteBackend = tfliteBackend;
  switch (mTfliteBackend) {
    case TFLITE_NNAPI: {
      tflite::StatefulNnApiDelegate::Options nnapi_options;
      nnapi_options.accelerator_name = nnapi_device_name;
      mTfliteNnapiDelegate =
          std::make_unique<tflite::StatefulNnApiDelegate>(nnapi_options);
      int delegationStatus = mTfliteInterpreter->ModifyGraphWithDelegate(
          mTfliteNnapiDelegate.get());
      *nnapiErrno = mTfliteNnapiDelegate->GetNnApiErrno();
      if (delegationStatus != kTfLiteOk ||
          *nnapiErrno != ANEURALNETWORKS_NO_ERROR) {
        __android_log_print(
            ANDROID_LOG_ERROR, LOG_TAG,
            "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
            modelfile, *nnapiErrno);
        return false;
      }
    } break;
    case TFLITE_GPU: {
#if defined(NN_BENCHMARK_ENABLE_GPU)
      mGpuDelegate = TfLiteGpuDelegateV2Create(/*default options=*/nullptr);
      if (mTfliteInterpreter->ModifyGraphWithDelegate(mGpuDelegate) !=
          kTfLiteOk) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                            "Failed to initialize GPU Delegate");
        return false;
      }
#else   // !defined(NN_BENCHMARK_ENABLE_GPU)
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "GPU delegate requested but not enabled with "
                          "NN_BENCHMARK_ENABLE_GPU");
      return false;
#endif  // defined(NN_BENCHMARK_ENABLE_GPU)
    } break;
    default:
      break;
  }
  return true;
}

BenchmarkModel::~BenchmarkModel() {
  switch (mTfliteBackend) {
    case TFLITE_GPU: {
#if defined(NN_BENCHMARK_ENABLE_GPU)
      TfLiteGpuDelegateV2Delete(mGpuDelegate);
#endif  // defined(NN_BENCHMARK_ENABLE_GPU)
    } break;
    default:
      break;
  }
}

bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
  int input = mTfliteInterpreter->inputs()[0];
  auto* input_tensor = mTfliteInterpreter->tensor(input);

  switch (input_tensor->type) {
    case kTfLiteFloat32:
    case kTfLiteUInt8: {
      void* raw = input_tensor->data.raw;
      memcpy(raw, dataPtr, length);
      break;
    }
    default:
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Input tensor type not supported");
      return false;
  }
  return true;
}

void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
                                         int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  auto& sink = result->inferenceOutputs[output_index];
  sink.insert(sink.end(), output_tensor->data.uint8,
              output_tensor->data.uint8 + output_tensor->bytes);
}

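// Compares one output tensor against the provided golden data and records the
// mean squared error and the maximum single-element error for that output in
// `result`.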
void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
                                    InferenceResult* result, int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  if (output_tensor->bytes != length) {
    FATAL("Wrong size of output tensor, expected %zu, is %zu",
          output_tensor->bytes, length);
  }

  size_t elements_count = 0;
  float err_sum = 0.0;
  float max_error = 0.0;
  switch (output_tensor->type) {
    case kTfLiteUInt8: {
      uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
      elements_count = output_tensor->bytes;
      for (size_t i = 0; i < output_tensor->bytes; ++i) {
        float err = ((float)output_raw[i]) - ((float)expected_data[i]);
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    case kTfLiteFloat32: {
      const float* expected = reinterpret_cast<const float*>(expected_data);
      float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
      elements_count = output_tensor->bytes / sizeof(float);
      for (size_t i = 0; i < output_tensor->bytes / sizeof(float); ++i) {
        float err = output_raw[i] - expected[i];
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    default:
      FATAL("Output tensor type %d not supported", output_tensor->type);
  }
  result->meanSquareErrors[output_index] = err_sum / elements_count;
  result->maxSingleErrors[output_index] = max_error;
}

bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
  // The benchmark only expects a single input tensor, hardcoded as 0.
  int input = mTfliteInterpreter->inputs()[0];
  mTfliteInterpreter->ResizeInputTensor(input, shape);
  if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to allocate tensors!");
    return false;
  }
  return true;
}

bool BenchmarkModel::runInference() {
  auto status = mTfliteInterpreter->Invoke();
  auto nnapi_errno = mTfliteNnapiDelegate
                         ? mTfliteNnapiDelegate->GetNnApiErrno()
                         : ANEURALNETWORKS_NO_ERROR;
  if (status != kTfLiteOk || nnapi_errno != ANEURALNETWORKS_NO_ERROR) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to invoke, tflite status: %d, nnapi errno: %d!",
                        (int)status, nnapi_errno);
    return false;
  }
  return true;
}

bool BenchmarkModel::resetStates() {
  auto status = mTfliteInterpreter->ResetVariableTensors();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to reset variable tensors: %d!", (int)status);
    return false;
  }
  return true;
}

bool BenchmarkModel::benchmark(
    const std::vector<InferenceInOutSequence>& inOutData,
    int seqInferencesMaxCount, float timeout, int flags,
    std::vector<InferenceResult>* results) {
  if (inOutData.empty()) {
    __android_log_print(ANDROID_LOG_WARN, LOG_TAG,
                        "Input/output vector is empty");
    return true;
  }

  float inferenceTotal = 0.0;
  for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
       ++seqInferenceIndex) {
    resetStates();

    const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
    const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
    const bool sampleResults = (flags & FLAG_SAMPLE_BENCHMARK_RESULTS) != 0;
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];

      // For NNAPI systrace usage documentation, see
      // frameworks/ml/nn/common/include/Tracing.h.
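      // Only the runInference() call below is included in the reported
      // inference time; input setup is traced under [NN_LA_PIO] but excluded
      // from the measurement.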
      kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
      kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
      if (data.input) {
        setInput(data.input, data.input_size);
      } else {
        int input = mTfliteInterpreter->inputs()[0];
        auto* input_tensor = mTfliteInterpreter->tensor(input);
        if (!data.createInput((uint8_t*)input_tensor->data.raw,
                              input_tensor->bytes)) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Input creation %d failed", i);
          return false;
        }
      }
      kTraceFunc.ATrace_endSection();

      long long startTime = currentTimeInUsec();
      const bool success = runInference();
      kTraceFunc.ATrace_endSection();
      long long endTime = currentTimeInUsec();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      float inferenceTime =
          static_cast<float>(endTime - startTime) / 1000000.0f;
      size_t outputsCount = mTfliteInterpreter->outputs().size();
      InferenceResult result{
          inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
      result.meanSquareErrors.resize(outputsCount);
      result.maxSingleErrors.resize(outputsCount);
      result.inferenceOutputs.resize(outputsCount);

      if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
        if (outputsCount != data.outputs.size()) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Golden/actual outputs (%zu/%zu) count mismatch",
                              data.outputs.size(), outputsCount);
          return false;
        }
        for (int j = 0; j < outputsCount; ++j) {
          getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
        }
      }

      if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
        for (int j = 0; j < outputsCount; ++j) {
          saveInferenceOutput(&result, j);
        }
      }

      if (!sampleResults ||
          (seqInferenceIndex % INFERENCE_OUT_SAMPLE_RATE) == 0) {
        results->push_back(result);
      }
      inferenceTotal += inferenceTime;
    }

    // Timeout?
    if (timeout > 0.001 && inferenceTotal > timeout) {
      return true;
    }
  }
  return true;
}

// If cacheDir is not nullptr, compilation caching will be used with NNAPI.
bool BenchmarkModel::runCompilation(const char* cacheDir) {
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&interpreter);
  if (!interpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to create TFlite interpreter");
    return false;
  }
  // Allow Fp16 precision for all models
  interpreter->SetAllowFp16PrecisionForFp32(true);

  if (mTfliteBackend == TFLITE_NNAPI) {
    tflite::StatefulNnApiDelegate::Options nnapi_options;
    nnapi_options.accelerator_name =
        mNnApiDeviceName.empty() ? nullptr : mNnApiDeviceName.c_str();
    if (cacheDir) {
      nnapi_options.cache_dir = cacheDir;
      nnapi_options.model_token = mModelFile.c_str();
    }
    tflite::StatefulNnApiDelegate delegate(nnapi_options);
    int delegationStatus = interpreter->ModifyGraphWithDelegate(&delegate);
    auto nnapiErrno = delegate.GetNnApiErrno();
    if (delegationStatus != kTfLiteOk ||
        nnapiErrno != ANEURALNETWORKS_NO_ERROR) {
      __android_log_print(
          ANDROID_LOG_ERROR, LOG_TAG,
          "Failed to initialize NNAPI Delegate for model %s, nnapi_errno is %d",
          mModelFile.c_str(), nnapiErrno);
      return false;
    }
  }
  return true;
}

// A helper class to manage the lifetime of a temporary cache directory.
class ScopedTempDirectory {
 public:
  ScopedTempDirectory(std::string base) : mBase(std::move(base)) {}
  ~ScopedTempDirectory() { cleanup(); }

  // Create a new temp directory, remove the old one if needed.
  void recreate() {
    cleanup();
    mTempDir = mBase + "/XXXXXX";
    mkdtemp(&mTempDir[0]);
  }
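  // Note: mkdtemp() rewrites the trailing "XXXXXX" template in place, so after
  // recreate() mTempDir holds the path of the directory actually created.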
  // Get the path to the temp directory.
  const char* get() const {
    return mTempDir.empty() ? nullptr : mTempDir.c_str();
  }

 private:
  void cleanup() {
    if (mTempDir.empty()) {
      return;
    }
    auto callback = [](const char* entry, const struct stat*, int,
                       struct FTW*) { return remove(entry); };
    nftw(mTempDir.c_str(), callback, 128, FTW_DEPTH | FTW_MOUNT | FTW_PHYS);
    mTempDir.clear();
  }

  std::string mBase;
  std::string mTempDir;
};

bool BenchmarkModel::getCompilationCacheSize(int* cacheSizeBytes) {
  if (cacheSizeBytes == nullptr) return false;

  // Create cache files.
  ScopedTempDirectory tempDir(mCacheDir.value());
  tempDir.recreate();
  const bool success = runCompilation(tempDir.get());
  if (!success) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
    return false;
  }

  // Compute total size of cache files.
  int totalSize = 0;
  DIR* dir = opendir(tempDir.get());
  if (dir == nullptr) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to open cache directory");
    return false;
  }
  struct dirent* dp = nullptr;
  while ((dp = readdir(dir)) != nullptr) {
    char fullPath[1024];
    snprintf(fullPath, 1024, "%s/%s", tempDir.get(), dp->d_name);
    struct stat st;
    int err = stat(fullPath, &st);
    if (err != 0) {
      closedir(dir);
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to stat %s",
                          fullPath);
      return false;
    }
    // Only accumulate sizes of regular files. This will exclude '.' and '..'.
    if (S_ISREG(st.st_mode)) {
      totalSize += st.st_size;
    }
  }
  closedir(dir);
  *cacheSizeBytes = totalSize;
  return true;
}

bool BenchmarkModel::benchmarkSingleTypeOfCompilation(
    CompilationBenchmarkType type, int maxNumIterations, float timeout,
    std::vector<float>* results) {
  if (results != nullptr) {
    results->clear();
  }
  ScopedTempDirectory tempDir(mCacheDir.value());

  // Initialize cache files to benchmark cache hit.
  if (type == CompilationBenchmarkType::PREPARE_FROM_CACHE) {
    tempDir.recreate();
    const bool success = runCompilation(tempDir.get());
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Save to cache failed");
      return false;
    }
  }

  float compilationTotal = 0.0;
  for (int i = 0; i < maxNumIterations; i++) {
    const char* cacheDir = nullptr;
    switch (type) {
      case CompilationBenchmarkType::WITHOUT_CACHE:
        cacheDir = nullptr;
        break;
      case CompilationBenchmarkType::SAVE_TO_CACHE:
        // Remove the cache files from the last iteration to benchmark cache
        // miss.
        tempDir.recreate();
        [[fallthrough]];
      case CompilationBenchmarkType::PREPARE_FROM_CACHE:
        cacheDir = tempDir.get();
        break;
      default:
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                            "Unknown CompilationBenchmarkType: %d",
                            static_cast<int>(type));
        return false;
    }

    kTraceFunc.ATrace_beginSection(
        "[NN_LA_PC]BenchmarkModel::benchmarkCompilation");
    const long long startTime = currentTimeInUsec();
    const bool success = runCompilation(cacheDir);
    const long long endTime = currentTimeInUsec();
    kTraceFunc.ATrace_endSection();
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Compilation %d failed",
                          i);
      return false;
    }

    const float compilationTime =
        static_cast<float>(endTime - startTime) / 1000000.0f;
    if (results != nullptr) {
      results->push_back(compilationTime);
    }

    // Timeout?
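    // Note that the timeout is only checked between iterations; a compilation
    // that is already in progress is never interrupted.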
    compilationTotal += compilationTime;
    if (timeout > 0.001 && compilationTotal > timeout) {
      return true;
    }
  }
  return true;
}

bool BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup(
    CompilationBenchmarkType type, int maxNumIterations, float warmupTimeout,
    float runTimeout, std::vector<float>* results) {
  kTraceFunc.ATrace_beginSection(
      "[NN_LA_PWM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
  bool success = benchmarkSingleTypeOfCompilation(type, maxNumIterations,
                                                  warmupTimeout, nullptr);
  kTraceFunc.ATrace_endSection();
  if (!success) return false;

  kTraceFunc.ATrace_beginSection(
      "[NN_LA_PBM]BenchmarkModel::benchmarkSingleTypeOfCompilationWithWarmup");
  success = benchmarkSingleTypeOfCompilation(type, maxNumIterations, runTimeout,
                                             results);
  kTraceFunc.ATrace_endSection();
  return success;
}

bool BenchmarkModel::benchmarkCompilation(int maxNumIterations,
                                          float warmupTimeout, float runTimeout,
                                          CompilationBenchmarkResult* result) {
  if (result == nullptr) return false;

  // Benchmark compile without cache.
  bool success = benchmarkSingleTypeOfCompilationWithWarmup(
      CompilationBenchmarkType::WITHOUT_CACHE, maxNumIterations, warmupTimeout,
      runTimeout, &result->compileWithoutCacheTimeSec);
  if (!success) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to benchmark compilation without cache");
    return false;
  }

  // Get compilation cache size.
  success = getCompilationCacheSize(&result->cacheSizeBytes);
  if (!success) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to retrieve compilation cache size");
    return false;
  }

  // Benchmark saving to cache and preparing from cache only if supported.
  if (result->cacheSizeBytes > 0) {
    // Benchmark saving to cache.
    auto& saveToCacheTimeSec = result->saveToCacheTimeSec.emplace();
    success = benchmarkSingleTypeOfCompilationWithWarmup(
        CompilationBenchmarkType::SAVE_TO_CACHE, maxNumIterations,
        warmupTimeout, runTimeout, &saveToCacheTimeSec);
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Failed to benchmark saving to cache");
      return false;
    }

    // Benchmark preparing from cache.
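    // (i.e. compile with the cache files already populated, which measures
    // the cache-hit path.)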
    auto& prepareFromCacheTimeSec = result->prepareFromCacheTimeSec.emplace();
    success = benchmarkSingleTypeOfCompilationWithWarmup(
        CompilationBenchmarkType::PREPARE_FROM_CACHE, maxNumIterations,
        warmupTimeout, runTimeout, &prepareFromCacheTimeSec);
    if (!success) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Failed to benchmark preparing from cache");
      return false;
    }
  }
  return true;
}

bool BenchmarkModel::dumpAllLayers(
    const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
  if (inOutData.empty()) {
    FATAL("Input/output vector is empty");
  }

  for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
       ++seqInferenceIndex) {
    resetStates();

    const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];
      setInput(data.input, data.input_size);
      const bool success = runInference();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      // The order of the tensors is not sorted by the tensor index.
      for (int tensor_order = 0; tensor_order < outputs.size();
           ++tensor_order) {
        int tensor_index = outputs[tensor_order];
        auto* output_tensor = mTfliteInterpreter->tensor(tensor_index);
        if (output_tensor->data.raw == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "output_tensor->data.raw == nullptr at index %d ",
                              tensor_index);
          continue;
        }
        char fullpath[1024];
        snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_order_%.3d_tensor_%.3d",
                 path, seqInferenceIndex, i, tensor_order, tensor_index);
        FILE* f = fopen(fullpath, "wb");
        fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
        fclose(f);
      }
    }
  }
  return true;
}