/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

namespace {
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
}  // namespace

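// Symmetric quantization maps floats to int8 values in [-127, 127] using a
// single scale and no zero point: scaling_factor = max(|min|, |max|) / 127 and
// quantized = round(value / scaling_factor), clamped to [-127, 127]. For
// example, with a range of 2.54 the scale is 0.02 and 1.0f quantizes to 50.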
void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor) {
  auto minmax = std::minmax_element(values, values + size);
  *min_value = *minmax.first;
  *max_value = *minmax.second;

  PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
                                  *max_value, scaling_factor);
}

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor) {
  const int32_t kScale = 127;
  const float range = std::max(std::abs(min_value), std::abs(max_value));
  if (range == 0) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    return;
  }
  *scaling_factor = range / kScale;
  const float scaling_factor_inv = kScale / range;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value =
        static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
    // Clamp: just in case some odd numeric offset.
    quantized_values[i] = static_cast<int8_t>(
        std::min(kScale, std::max(-kScale, quantized_value)));
  }
}

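// Asymmetric quantization maps floats to the full int8 range [-128, 127] with
// a scale and a zero point. The zero point is picked from whichever range
// endpoint yields the smaller rounding error, then nudged into the
// representable range; extending the real range to include 0.0f guarantees
// that zero is exactly representable.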
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset) {
  const int32_t kMinScale = -128;
  const int32_t kMaxScale = 127;
  const double qmin_double = kMinScale;
  const double qmax_double = kMaxScale;
  const auto minmax = std::minmax_element(values, values + size);
  const double rmin = std::fmin(0, *minmax.first);
  const double rmax = std::fmax(0, *minmax.second);
  if (rmin == rmax) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    *offset = 0;
    return;
  } else {
    double scale = (rmax - rmin) / (qmax_double - qmin_double);
    const double zero_point_from_min = qmin_double - rmin / scale;
    const double zero_point_from_max = qmax_double - rmax / scale;
    const double zero_point_from_min_error =
        std::abs(qmin_double) + std::abs(rmin / scale);
    const double zero_point_from_max_error =
        std::abs(qmax_double) + std::abs(rmax / scale);
    const double zero_point_double =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;
    int8_t nudged_zero_point = 0;
    if (zero_point_double <= qmin_double) {
      nudged_zero_point = kMinScale;
    } else if (zero_point_double >= qmax_double) {
      nudged_zero_point = kMaxScale;
    } else {
      nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
    }
    *scaling_factor = scale;
    *offset = nudged_zero_point;
  }
  const float scaling_factor_inv = 1.0 / *scaling_factor;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value = static_cast<int32_t>(
        TfLiteRound(*offset + values[i] * scaling_factor_inv));
    quantized_values[i] =
        std::min(kMaxScale, std::max(kMinScale, quantized_value));
  }
}

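// Dense float matrix * batch-of-vectors product. The matrix is row-major
// (m_rows x m_cols), each batch vector has m_cols entries, and the dot
// products are accumulated into a batch-major buffer:
// result[batch * m_rows + row] += dot(matrix row, batch vector).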
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result) {
  float* result_in_batch = result;
  for (int b = 0; b < n_batch; b++) {
    const float* matrix_ptr = matrix;
    for (int r = 0; r < m_rows; r++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + b * m_cols;
      for (int c = 0; c < m_cols; c++) {
        dot_prod += *matrix_ptr++ * *vector_in_batch++;
      }
      *result_in_batch += dot_prod;
      ++result_in_batch;
    }
  }
}

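// Hybrid variant: int8 weights and int8 inputs with a float output. Each batch
// vector carries its own scaling factor (typically produced by the symmetric
// quantization above), so the int32 dot product is dequantized by multiplying
// with scaling_factors[batch] before being accumulated into the float result.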
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result) {
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * (vectors[col]);
      }  // for col
      *result += dotprod * batch_scaling_factor;
      ++result;
    }  // for row
  }  // for batch
}

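// Variant with optional per-channel scales and asymmetric input offsets.
// Because dot(v - offset, w) == dot(v, w) - offset * sum(w), the input offset
// is folded in by subtracting row_sums[row] * batch_offset from each dot
// product; the per-row weight sums are cached across calls via
// compute_row_sums. The scratch buffer and CpuBackendContext are unused in
// this portable implementation.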
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  if (input_offset == nullptr) {
    PortableMatrixBatchVectorMultiplyAccumulate(
        matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
    return;
  }
  if (!compute_row_sums || *compute_row_sums) {
    PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
    if (compute_row_sums) {
      *compute_row_sums = false;
    }
  }

  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const int32_t batch_offset = input_offset[batch];
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      int32_t dotprod = 0;
      float scale = batch_scaling_factor;
      if (per_channel_scale) {
        scale *= per_channel_scale[row];
      }
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * vectors[col];
      }  // for col
      dotprod -= row_sums[row] * batch_offset;
      *result += dotprod * scale;
      ++result;
    }  // for row
  }  // for batch
}

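// Sparse float matrix * batch-of-vectors product for a block-compressed
// matrix with 1x4 blocks. For each row, segments[row]..segments[row + 1]
// indexes the run of stored non-zero blocks (CSR-style), and indices[i] gives
// each block's starting column in units of kBlockSize.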
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  const int kBlockSize = 4;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + batch * m_cols;
      for (int i = segments[row]; i < segments[row + 1]; i++) {
        const int block_start_index = indices[i] * kBlockSize;
        const float* vector_block_in_batch_ptr =
            vector_in_batch + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}

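// The ledger-based sparse formats below use 16-wide blocks. For every row the
// ledger stores the number of non-zero blocks followed by their block indices,
// and only those non-zero blocks are stored contiguously in the matrix data.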
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    const uint8_t* ledger_ptr = ledger;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      int num_nonzero_blocks = *ledger_ptr++;
      if (num_nonzero_blocks > 0) {
        const float* vector_in_batch = vector + batch * m_cols;
        for (int i = 0; i < num_nonzero_blocks; i++) {
          const int block_start_index = *ledger_ptr++ * kBlockSize;
          const float* vector_block_in_batch_ptr =
              vector_in_batch + block_start_index;
          for (int c = 0; c < kBlockSize; c++) {
            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
          }
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  static const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const uint8_t* ledger_ptr = ledger;
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      int num_nonzero_blocks = *ledger_ptr++;
      for (int i = 0; i < num_nonzero_blocks; i++) {
        const int block_start_index = *ledger_ptr++ * kBlockSize;
        const int8_t* vector_block_ptr = vectors + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dotprod += (*row_ptr++) * (*vector_block_ptr++);
        }  // for block
      }  // for num_nonzero_blocks
      result[batch * m_rows + row] += dotprod * batch_scaling_factor;
    }  // for row
  }  // for batch
}

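// Fully quantized matrix * batch-of-vectors product. The int32 accumulator is
// initialized with the bias, requantized with MultiplyByQuantizedMultiplier,
// shifted by the output zero point, accumulated into the existing output, and
// saturated to the output type T (int8_t or int16_t, see the wrappers below).
// The scratch buffer and CpuBackendContext arguments of the wrappers are
// unused in the portable implementation.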
template <typename T>
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    T* output) {
  const int16_t output_max = std::numeric_limits<T>::max();
  const int16_t output_min = std::numeric_limits<T>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int32_t acc = bias[row];
      for (int col = 0; col < n_input; ++col) {
        int8_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += input_val * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += output_zp;
      acc += output[batch * n_output + row];
      if (acc > output_max) {
        acc = output_max;
      }
      if (acc < output_min) {
        acc = output_min;
      }
      output[batch * n_output + row] = static_cast<T>(acc);
    }
  }
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

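// Computes an int8 gate output: the input zero point is removed before the
// multiply, the accumulator is rescaled with the effective scale, and the
// gate output zero point is added back before saturating to the int8 range.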
void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp) {
  const int32_t int8_max = std::numeric_limits<int8_t>::max();
  const int32_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_cell; ++row) {
      int32_t acc = 0;
      for (int col = 0; col < n_input; ++col) {
        int32_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += (input_val - input_zeropoint) * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
                                          input_to_gate_effective_scale_b);
      acc += gate_output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
    }
  }
}

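// Projection matmul with int16 activations and int8 weights: the accumulation
// is done in int64 and clamped to the int32 range when an addition is
// detected to have overflowed, then requantized, shifted by the output zero
// point, and saturated to int8.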
void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
  const int16_t int8_max = std::numeric_limits<int8_t>::max();
  const int16_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int64_t acc = gate_bias[row];
      for (int col = 0; col < n_hidden; ++col) {
        int16_t input_val = hidden[batch * n_hidden + col];
        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
        int64_t curr = acc;
        acc += input_val * weights_val;
        if (input_val * weights_val > 0 && acc < curr) {
          acc = std::numeric_limits<int32_t>::max();
        }
        if (input_val * weights_val < 0 && acc > curr) {
          acc = std::numeric_limits<int32_t>::min();
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
                                          proj_effective_scale_b);
      acc += output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      proj_output[batch * n_output + row] = acc;
    }
  }
}

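// Integer layer normalization. The mean is kept with an extra 2^10 factor of
// precision, the variance computation carries a 2^20 intermediate factor
// (kTwoToPower20) that is divided back out, and the inverse standard
// deviation is applied through a quantized multiplier obtained from
// GetInvSqrtQuantizedMultiplierExp before the per-element weight and bias.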
void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output) {
  // The square of std::pow(2, 10), which is the extra factor that makes sure
  // normalized values have enough resolution.
  static const int kTwoToPower20 = 1 << 20;
  for (int i = 0; i < n_batch; ++i) {
    int64_t sum = 0;
    int64_t sum_sq = 0;
    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      sum += val;
      sum_sq += val * val;
    }
    int32_t mean =
        static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
    // TODO(b/173994730): Avoids overflow but only works for POT n_input.
    int32_t temp = kTwoToPower20 / n_input;
    int64_t variance =
        sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
    int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
    if (variance2 < 1) {
      variance2 = variance_limit;
    }
    int32_t stddev_inverse_a;
    int stddev_inverse_b;
    GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
                                     &stddev_inverse_a, &stddev_inverse_b);

    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      int32_t shifted = 1024 * val - mean;
      int32_t rescaled = MultiplyByQuantizedMultiplier(
          shifted, stddev_inverse_a, stddev_inverse_b);
      // TODO(jianlijianli): Saturate this.
      int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
      int32_t val4 =
          static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
      int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
                                                   layer_norm_scale_b + 12);
      val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
      output[index] = static_cast<int16_t>(val5);
    }
  }
}

void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  // This is to suppress a lint warning.
  const double two = 2.0;
  const float layer_norm_scale =
      layer_norm_scale_a *
      std::pow(two, static_cast<double>(layer_norm_scale_b - 31));
  const float bias_scale = std::pow(two, -10) * layer_norm_scale;

  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    float sum_sq = 0.0f;
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float value = static_cast<float>(input[index]);
      sum += value;
      sum_sq += value * value;
    }
    const float mean = sum / n_input;
    float stddev_inv = 0.0f;
    const float variance = sum_sq / n_input - mean * mean;
    if (variance == 0) {
      stddev_inv = 1.0f / sqrt(1e-8);
    } else {
      stddev_inv = 1.0f / sqrt(variance);
    }
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float normalized_value =
          (static_cast<float>(input[index]) - mean) * stddev_inv;
      const float weighted_normalized_value =
          normalized_value * layer_norm_weights[i] * layer_norm_scale +
          bias[i] * bias_scale;
      const int32_t quant_output = static_cast<int32_t>(
          std::round(weighted_normalized_value * std::pow(2, 12)));
      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
    }
  }
}

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output) {
  for (int i = 0; i < n_row; ++i) {
    int32_t row_sum = 0;
    for (int j = 0; j < n_col; ++j) {
      row_sum += *matrix++;
    }
    output[i] += row_sum * scalar;
  }
}

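// Sigmoid on int16 activations using gemmlowp fixed-point math: the input is
// interpreted as Q3.12 and the output is produced as Q0.15. The float variant
// below computes the same mapping in floating point and requantizes.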
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int c = 0; c < n_input; c++) {
      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
      const int index = batch * n_input + c;
      F3 sigmoid_input = F3::FromRaw(input[index]);
      F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
      output[index] = sigmoid_output.raw();
    }
  }
}

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input = input[index] * std::pow(2, -12);
      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
      const int32_t quant_output =
          static_cast<int32_t>(float_output * std::pow(2, 15));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

template <int IntegerBits>
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
                           int32_t n_input, int16_t* output) {
  using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      FX tanh_input = FX::FromRaw(input[index]);
      F0 tanh_output = gemmlowp::tanh(tanh_input);
      output[index] = tanh_output.raw();
    }
  }
}

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output) {
  assert(integer_bits <= 6);
#define DISPATCH_TANH(i)                                       \
  case i:                                                      \
    PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
    break;
  switch (integer_bits) {
    DISPATCH_TANH(0);
    DISPATCH_TANH(1);
    DISPATCH_TANH(2);
    DISPATCH_TANH(3);
    DISPATCH_TANH(4);
    DISPATCH_TANH(5);
    DISPATCH_TANH(6);
    default:
      return;
  }
#undef DISPATCH_TANH
}

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  const double two = 2.0;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input =
          input[index] * std::pow(two, static_cast<double>(integer_bits));
      const float float_output = std::tanh(float_input);
      const int32_t quant_output =
          static_cast<int32_t>(float_output * std::pow(2, 15));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      output[index] =
          static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
    }
  }
}

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
      value -= output_zp;
      value = std::min(std::max(static_cast<int32_t>(-128), value),
                       static_cast<int32_t>(127));

      output[index] = static_cast<int8_t>(value);
    }
  }
}

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      int32_t sum = input_1[index] + input_2[index];
      const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
      output[index] = static_cast<int16_t>(sum_clamped);
    }
  }
}

float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size) {
  float result = 0.0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}

namespace {
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size) {
  int32_t result = 0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}
}  // namespace

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result) {
  for (int b = 0; b < n_batch; b++) {
    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
    vector1 += v_size;
    vector2 += v_size;
  }
}

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result) {
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < v_size; v++) {
      int32_t prod = vector[v] * *batch_vector++;
      prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
      int32_t output = prod + *result;
      output = std::max(std::min(static_cast<int32_t>(32767), output),
                        static_cast<int32_t>(-32768));
      *result++ = output;
    }
  }
}

void PortableSub1Vector(const float* vector, int v_size, float* result) {
  for (int v = 0; v < v_size; v++) {
    *result++ = 1.0f - *vector++;
  }
}

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  static const int16_t kOne = 32767;
  for (int v = 0; v < v_size; v++) {
    *result++ = kOne - *vector++;
  }
}

void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
                                  const float scale, float* result) {
  for (int v = 0; v < v_size; ++v) {
    *result++ = scale * *vector++;
  }
}

void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch) {
  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      sum += input_vector[i];
    }
    const float mean = sum / v_size;
    float sum_diff_sq = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      const float diff = input_vector[i] - mean;
      sum_diff_sq += diff * diff;
    }
    const float variance = sum_diff_sq / v_size;
    constexpr float kNormalizationConstant = 1e-8f;
    const float stddev_inv =
        1.0f / std::sqrt(variance + kNormalizationConstant);
    for (int i = 0; i < v_size; ++i) {
      output_vector[i] = (input_vector[i] - mean) * stddev_inv;
    }
    input_vector += v_size;
    output_vector += v_size;
  }
}

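// Adds the input and recurrent gate contributions: each operand has its zero
// point removed and is rescaled with its own effective scale, then the sum is
// saturated to the int16 range.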
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int i = 0; i < n_batch * n_cell; ++i) {
    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
    int32_t h =
        static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
                                                     input_effective_scale_b);
    int32_t h_scaled = MultiplyByQuantizedMultiplier(
        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
    int32_t y = h_scaled + x_scaled;
    if (y > int16_max) {
      y = int16_max;
    }
    if (y < int16_min) {
      y = int16_min;
    }
    output[i] = static_cast<int16_t>(y);
  }
}

}  // namespace tensor_utils
}  // namespace tflite