/**
 * Copyright 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "run_tflite.h"

#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
#include "tensorflow/lite/kernels/register.h"

#include <android/log.h>
#include <dlfcn.h>
#include <sys/time.h>

#include <cassert>
#include <cstdio>
#include <cstring>

#define LOG_TAG "NN_BENCHMARK"

#define FATAL(fmt, ...)                                                  \
  do {                                                                   \
    __android_log_print(ANDROID_LOG_FATAL, LOG_TAG, fmt, ##__VA_ARGS__); \
    assert(false);                                                       \
  } while (0)

namespace {

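// Returns the current wall-clock time in microseconds.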
long long currentTimeInUsec() {
  timeval tv;
  gettimeofday(&tv, NULL);
  return tv.tv_sec * 1000000LL + tv.tv_usec;
}

// Workaround for build systems that make it difficult to pick the correct NDK
// API level. NDK tracing methods are dynamically loaded from libandroid.so.
typedef void* (*fp_ATrace_beginSection)(const char* sectionName);
typedef void* (*fp_ATrace_endSection)();
struct TraceFunc {
  fp_ATrace_beginSection ATrace_beginSection;
  fp_ATrace_endSection ATrace_endSection;
};
TraceFunc setupTraceFunc() {
  void* lib = dlopen("libandroid.so", RTLD_NOW | RTLD_LOCAL);
  if (lib == nullptr) {
    FATAL("unable to open libandroid.so");
  }
  return {
      reinterpret_cast<fp_ATrace_beginSection>(
          dlsym(lib, "ATrace_beginSection")),
      reinterpret_cast<fp_ATrace_endSection>(dlsym(lib, "ATrace_endSection"))};
}
static TraceFunc kTraceFunc{setupTraceFunc()};

}  // namespace

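// Creates and initializes a BenchmarkModel from a .tflite file. Returns
// nullptr (and frees the partially constructed object) if initialization
// fails.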
BenchmarkModel* BenchmarkModel::create(const char* modelfile, bool use_nnapi,
                                       bool enable_intermediate_tensors_dump,
                                       const char* nnapi_device_name) {
  BenchmarkModel* model = new BenchmarkModel();
  if (!model->init(modelfile, use_nnapi, enable_intermediate_tensors_dump,
                   nnapi_device_name)) {
    delete model;
    return nullptr;
  }
  return model;
}

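// Loads the flatbuffer model and builds the TFLite interpreter. If requested,
// exposes every intermediate tensor as a model output and/or applies the NNAPI
// delegate (optionally targeting a specific accelerator device by name).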
bool BenchmarkModel::init(const char* modelfile, bool use_nnapi,
                          bool enable_intermediate_tensors_dump,
                          const char* nnapi_device_name) {
  __android_log_print(ANDROID_LOG_INFO, LOG_TAG, "BenchmarkModel %s",
                      modelfile);

  // Memory-map the model. NOTE: the flatbuffer must stay alive at least as
  // long as the interpreter built from it.
  mTfliteModel = tflite::FlatBufferModel::BuildFromFile(modelfile);
  if (!mTfliteModel) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to load model %s",
                        modelfile);
    return false;
  }

  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder(*mTfliteModel, resolver)(&mTfliteInterpreter);
  if (!mTfliteInterpreter) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to create TFlite interpreter");
    return false;
  }

  if (enable_intermediate_tensors_dump) {
    // Make the output of every op a model output. This way we will be able to
    // fetch each intermediate tensor when running with delegates.
    std::vector<int> outputs;
    for (size_t node = 0; node < mTfliteInterpreter->nodes_size(); ++node) {
      auto node_outputs =
          mTfliteInterpreter->node_and_registration(node)->first.outputs;
      outputs.insert(outputs.end(), node_outputs->data,
                     node_outputs->data + node_outputs->size);
    }
    mTfliteInterpreter->SetOutputs(outputs);
  }

  // Allow fp16 precision for all fp32 models.
  mTfliteInterpreter->SetAllowFp16PrecisionForFp32(true);

  if (use_nnapi) {
    if (nnapi_device_name != nullptr) {
      __android_log_print(ANDROID_LOG_INFO, LOG_TAG,
                          "Running NNAPI on device %s", nnapi_device_name);
    }
    if (mTfliteInterpreter->ModifyGraphWithDelegate(
            tflite::NnApiDelegate(nnapi_device_name)) != kTfLiteOk) {
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Failed to initialize NNAPI Delegate");
      return false;
    }
  }
  return true;
}

BenchmarkModel::BenchmarkModel() {}
BenchmarkModel::~BenchmarkModel() {}

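// Copies |length| bytes of input data into the model's first input tensor.
// Only float32 and uint8 input tensors are supported.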
bool BenchmarkModel::setInput(const uint8_t* dataPtr, size_t length) {
  int input = mTfliteInterpreter->inputs()[0];
  auto* input_tensor = mTfliteInterpreter->tensor(input);

  switch (input_tensor->type) {
    case kTfLiteFloat32:
    case kTfLiteUInt8: {
      void* raw = input_tensor->data.raw;
      memcpy(raw, dataPtr, length);
      break;
    }
    default:
      __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                          "Input tensor type not supported");
      return false;
  }
  return true;
}
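
// Appends the raw bytes of output tensor |output_index| to the corresponding
// buffer in |result|.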
void BenchmarkModel::saveInferenceOutput(InferenceResult* result,
                                         int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  auto& sink = result->inferenceOutputs[output_index];
  sink.insert(sink.end(), output_tensor->data.uint8,
              output_tensor->data.uint8 + output_tensor->bytes);
}

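// Compares output tensor |output_index| against the golden |expected_data| and
// stores the mean-square error and the maximum single-element error in
// |result|. Only uint8 and float32 output tensors are supported.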
void BenchmarkModel::getOutputError(const uint8_t* expected_data, size_t length,
                                    InferenceResult* result, int output_index) {
  int output = mTfliteInterpreter->outputs()[output_index];
  auto* output_tensor = mTfliteInterpreter->tensor(output);
  if (output_tensor->bytes != length) {
    FATAL("Wrong size of output tensor: expected %zu bytes, got %zu", length,
          output_tensor->bytes);
  }

  size_t elements_count = 0;
  float err_sum = 0.0;
  float max_error = 0.0;
  switch (output_tensor->type) {
    case kTfLiteUInt8: {
      uint8_t* output_raw = mTfliteInterpreter->typed_tensor<uint8_t>(output);
      elements_count = output_tensor->bytes;
      for (size_t i = 0; i < output_tensor->bytes; ++i) {
        float err = ((float)output_raw[i]) - ((float)expected_data[i]);
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    case kTfLiteFloat32: {
      const float* expected = reinterpret_cast<const float*>(expected_data);
      float* output_raw = mTfliteInterpreter->typed_tensor<float>(output);
      elements_count = output_tensor->bytes / sizeof(float);
      for (size_t i = 0; i < elements_count; ++i) {
        float err = output_raw[i] - expected[i];
        if (err > max_error) max_error = err;
        err_sum += err * err;
      }
      break;
    }
    default:
      FATAL("Output tensor type %d not supported", output_tensor->type);
  }
  result->meanSquareErrors[output_index] = err_sum / elements_count;
  result->maxSingleErrors[output_index] = max_error;
}

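// Resizes the single input tensor to |shape| and re-allocates all tensors.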
bool BenchmarkModel::resizeInputTensors(std::vector<int> shape) {
  // The benchmark expects a single input tensor, hardcoded as index 0.
  int input = mTfliteInterpreter->inputs()[0];
  mTfliteInterpreter->ResizeInputTensor(input, shape);
  if (mTfliteInterpreter->AllocateTensors() != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to allocate tensors!");
    return false;
  }
  return true;
}

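// Invokes the interpreter once; returns false on failure.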
bool BenchmarkModel::runInference() {
  auto status = mTfliteInterpreter->Invoke();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Failed to invoke: %d!",
                        (int)status);
    return false;
  }
  return true;
}

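// Resets the interpreter's variable (state) tensors, e.g. between sequences
// of a stateful model.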
bool BenchmarkModel::resetStates() {
  auto status = mTfliteInterpreter->ResetVariableTensors();
  if (status != kTfLiteOk) {
    __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                        "Failed to reset variable tensors: %d!", (int)status);
    return false;
  }
  return true;
}

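// Runs up to |seqInferencesMaxCount| inference sequences, cycling through
// |inOutData|, and times each inference. Unless FLAG_IGNORE_GOLDEN_OUTPUT is
// set, each output is compared against the golden data; unless
// FLAG_DISCARD_INFERENCE_OUTPUT is set, the raw outputs are stored in
// |results|. Stops early once the accumulated inference time exceeds a
// positive |timeout| (in seconds).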
bool BenchmarkModel::benchmark(
    const std::vector<InferenceInOutSequence>& inOutData,
    int seqInferencesMaxCount, float timeout, int flags,
    std::vector<InferenceResult>* results) {
  if (inOutData.empty()) {
    FATAL("Input/output vector is empty");
  }

  float inferenceTotal = 0.0;
  for (int seqInferenceIndex = 0; seqInferenceIndex < seqInferencesMaxCount;
       ++seqInferenceIndex) {
    resetStates();

    const int inputOutputSequenceIndex = seqInferenceIndex % inOutData.size();
    const InferenceInOutSequence& seq = inOutData[inputOutputSequenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];

      // For NNAPI systrace usage documentation, see
      // frameworks/ml/nn/common/include/Tracing.h.
      kTraceFunc.ATrace_beginSection("[NN_LA_PE]BenchmarkModel::benchmark");
      kTraceFunc.ATrace_beginSection("[NN_LA_PIO]BenchmarkModel::input");
      if (data.input) {
        setInput(data.input, data.input_size);
      } else {
        int input = mTfliteInterpreter->inputs()[0];
        auto* input_tensor = mTfliteInterpreter->tensor(input);
        if (!data.createInput((uint8_t*)input_tensor->data.raw,
                              input_tensor->bytes)) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Input creation %d failed", i);
          return false;
        }
      }
      kTraceFunc.ATrace_endSection();
      long long startTime = currentTimeInUsec();
      const bool success = runInference();
      kTraceFunc.ATrace_endSection();
      long long endTime = currentTimeInUsec();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      float inferenceTime =
          static_cast<float>(endTime - startTime) / 1000000.0f;
      size_t outputsCount = mTfliteInterpreter->outputs().size();
      InferenceResult result{
          inferenceTime, {}, {}, {}, inputOutputSequenceIndex, i};
      result.meanSquareErrors.resize(outputsCount);
      result.maxSingleErrors.resize(outputsCount);
      result.inferenceOutputs.resize(outputsCount);

      if ((flags & FLAG_IGNORE_GOLDEN_OUTPUT) == 0) {
        if (outputsCount != data.outputs.size()) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Golden/actual outputs (%zu/%zu) count mismatch",
                              data.outputs.size(), outputsCount);
          return false;
        }
        for (int j = 0; j < outputsCount; ++j) {
          getOutputError(data.outputs[j].ptr, data.outputs[j].size, &result, j);
        }
      }

      if ((flags & FLAG_DISCARD_INFERENCE_OUTPUT) == 0) {
        for (int j = 0; j < outputsCount; ++j) {
          saveInferenceOutput(&result, j);
        }
      }
      results->push_back(result);
      inferenceTotal += inferenceTime;
    }

    // Stop early if the accumulated inference time exceeded the timeout.
    if (timeout > 0.001 && inferenceTotal > timeout) {
      return true;
    }
  }
  return true;
}

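// Runs every sequence in |inOutData| once and writes each allocated tensor to
// a file named dump_<sequence>_seq_<inference>_tensor_<index> under |path|.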
bool BenchmarkModel::dumpAllLayers(
    const char* path, const std::vector<InferenceInOutSequence>& inOutData) {
  if (inOutData.empty()) {
    FATAL("Input/output vector is empty");
  }

  for (int seqInferenceIndex = 0; seqInferenceIndex < inOutData.size();
       ++seqInferenceIndex) {
    resetStates();

    const InferenceInOutSequence& seq = inOutData[seqInferenceIndex];
    for (int i = 0; i < seq.size(); ++i) {
      const InferenceInOut& data = seq[i];
      setInput(data.input, data.input_size);
      const bool success = runInference();
      if (!success) {
        __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, "Inference %d failed",
                            i);
        return false;
      }

      for (int tensor = 0; tensor < mTfliteInterpreter->tensors_size();
           ++tensor) {
        auto* output_tensor = mTfliteInterpreter->tensor(tensor);
        if (output_tensor->data.raw == nullptr) {
          continue;
        }
        char fullpath[1024];
        snprintf(fullpath, 1024, "%s/dump_%.3d_seq_%.3d_tensor_%.3d", path,
                 seqInferenceIndex, i, tensor);
        FILE* f = fopen(fullpath, "wb");
        if (f == nullptr) {
          __android_log_print(ANDROID_LOG_ERROR, LOG_TAG,
                              "Failed to open dump file %s", fullpath);
          return false;
        }
        fwrite(output_tensor->data.raw, output_tensor->bytes, 1, f);
        fclose(f);
      }
    }
  }
  return true;
}