/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>

#include "fixedpoint/fixedpoint.h"
#include "tensorflow/lite/kernels/internal/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/cppmath.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

namespace {
const int32_t kInt16Max = std::numeric_limits<int16_t>::max();
const int32_t kInt16Min = std::numeric_limits<int16_t>::min();
}  // namespace

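// Symmetrically quantizes |values| into int8 in [-127, 127]. This overload
// also reports the observed min/max of the input; the scaling factor maps the
// largest absolute input value to 127.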
void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor) {
  auto minmax = std::minmax_element(values, values + size);
  *min_value = *minmax.first;
  *max_value = *minmax.second;

  PortableSymmetricQuantizeFloats(values, size, quantized_values, *min_value,
                                  *max_value, scaling_factor);
}

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor) {
  const int32_t kScale = 127;
  const float range = std::max(std::abs(min_value), std::abs(max_value));
  if (range == 0) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    return;
  }
  *scaling_factor = range / kScale;
  const float scaling_factor_inv = kScale / range;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value =
        static_cast<int32_t>(TfLiteRound(values[i] * scaling_factor_inv));
    // Clamp: just in case some odd numeric offset.
    quantized_values[i] = static_cast<int8_t>(
        std::min(kScale, std::max(-kScale, quantized_value)));
  }
}

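// Asymmetrically quantizes |values| into int8 in [-128, 127], choosing a scale
// and a nudged zero point (|offset|) that cover the observed input range
// (always extended to include zero).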
void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset) {
  const int32_t kMinScale = -128;
  const int32_t kMaxScale = 127;
  const double qmin_double = kMinScale;
  const double qmax_double = kMaxScale;
  const auto minmax = std::minmax_element(values, values + size);
  const double rmin = std::fmin(0, *minmax.first);
  const double rmax = std::fmax(0, *minmax.second);
  if (rmin == rmax) {
    memset(quantized_values, 0, size * sizeof(int8_t));
    *scaling_factor = 1;
    *offset = 0;
    return;
  } else {
    double scale = (rmax - rmin) / (qmax_double - qmin_double);
    const double zero_point_from_min = qmin_double - rmin / scale;
    const double zero_point_from_max = qmax_double - rmax / scale;
    const double zero_point_from_min_error =
        std::abs(qmin_double) + std::abs(rmin / scale);
    const double zero_point_from_max_error =
        std::abs(qmax_double) + std::abs(rmax / scale);
    const double zero_point_double =
        zero_point_from_min_error < zero_point_from_max_error
            ? zero_point_from_min
            : zero_point_from_max;
    int8_t nudged_zero_point = 0;
    if (zero_point_double <= qmin_double) {
      nudged_zero_point = kMinScale;
    } else if (zero_point_double >= qmax_double) {
      nudged_zero_point = kMaxScale;
    } else {
      nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
    }
    *scaling_factor = scale;
    *offset = nudged_zero_point;
  }
  const float scaling_factor_inv = 1.0 / *scaling_factor;
  for (int i = 0; i < size; ++i) {
    const int32_t quantized_value = static_cast<int32_t>(
        TfLiteRound(*offset + values[i] * scaling_factor_inv));
    quantized_values[i] =
        std::min(kMaxScale, std::max(kMinScale, quantized_value));
  }
}

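// Dense float matrix * batch-vector product; each row's dot product is
// accumulated into the corresponding entry of |result| (row-major per batch).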
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result) {
  float* result_in_batch = result;
  for (int b = 0; b < n_batch; b++) {
    const float* matrix_ptr = matrix;
    for (int r = 0; r < m_rows; r++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + b * m_cols;
      for (int c = 0; c < m_cols; c++) {
        dot_prod += *matrix_ptr++ * *vector_in_batch++;
      }
      *result_in_batch += dot_prod;
      ++result_in_batch;
    }
  }
}

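// Hybrid version: int8 matrix and vectors with int32 accumulation; each row's
// dot product is scaled back to float by the per-batch scaling factor before
// being accumulated into |result|.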
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result) {
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * (vectors[col]);
      }  // for col
      *result += dotprod * batch_scaling_factor;
      ++result;
    }  // for row
  }    // for batch
}

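// As above, plus an asymmetric input offset and optional per-channel scales.
// Precomputed row sums (cached via |compute_row_sums|) reduce the offset
// correction to a single multiply per row.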
void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  if (input_offset == nullptr) {
    PortableMatrixBatchVectorMultiplyAccumulate(
        matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result);
    return;
  }
  if (!compute_row_sums || *compute_row_sums) {
    PortableReductionSumVector(matrix, row_sums, m_rows, m_cols);
    if (compute_row_sums) {
      *compute_row_sums = false;
    }
  }

  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const int32_t batch_offset = input_offset[batch];
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      int32_t dotprod = 0;
      float scale = batch_scaling_factor;
      if (per_channel_scale) {
        scale *= per_channel_scale[row];
      }
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      for (int col = 0; col < m_cols; ++col, ++row_ptr) {
        dotprod += (*row_ptr) * vectors[col];
      }  // for col
      dotprod -= row_sums[row] * batch_offset;
      *result += dotprod * scale;
      ++result;
    }  // for row
  }    // for batch
}

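// Sparse float matrix * batch-vector product with 1x4 block compression:
// segments[row]..segments[row + 1] index into |indices|, which lists the
// non-zero 4-wide column blocks; the block values are stored contiguously in
// |matrix|.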
void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  const int kBlockSize = 4;
  TFLITE_DCHECK_EQ(m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      const float* vector_in_batch = vector + batch * m_cols;
      for (int i = segments[row]; i < segments[row + 1]; i++) {
        const int block_start_index = indices[i] * kBlockSize;
        const float* vector_block_in_batch_ptr =
            vector_in_batch + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}

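// Sparse float matrix * batch-vector product using a ledger: for each row the
// |ledger| stores the number of non-zero 16-wide blocks followed by their
// block indices; only those blocks are stored in |matrix|.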
void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; batch++) {
    const float* matrix_ptr = matrix;
    const uint8_t* ledger_ptr = ledger;
    for (int row = 0; row < m_rows; row++) {
      float dot_prod = 0.0f;
      int num_nonzero_blocks = *ledger_ptr++;
      if (num_nonzero_blocks > 0) {
        const float* vector_in_batch = vector + batch * m_cols;
        for (int i = 0; i < num_nonzero_blocks; i++) {
          const int block_start_index = *ledger_ptr++ * kBlockSize;
          const float* vector_block_in_batch_ptr =
              vector_in_batch + block_start_index;
          for (int c = 0; c < kBlockSize; c++) {
            dot_prod += *matrix_ptr++ * *vector_block_in_batch_ptr++;
          }
        }
      }
      result[batch * m_rows + row] += dot_prod;
    }
  }
}

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  static const int kBlockSize = 16;
  TFLITE_DCHECK_EQ(  // NOLINT
      m_cols % kBlockSize, 0);
  for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) {
    const float batch_scaling_factor = scaling_factors[batch];
    const uint8_t* ledger_ptr = ledger;
    // Get the address of the first row.
    const int8_t* row_ptr = matrix;
    for (int row = 0; row < m_rows; ++row) {
      // Initialize the dot product sum for the row to 0.
      int32_t dotprod = 0;
#if defined(__GNUC__)
      // Prefetch the row to cache.
      __builtin_prefetch(row_ptr, 0 /* prefetch for read */,
                         3 /* temporal locality */);
#endif
      int num_nonzero_blocks = *ledger_ptr++;
      for (int i = 0; i < num_nonzero_blocks; i++) {
        const int block_start_index = *ledger_ptr++ * kBlockSize;
        const int8_t* vector_block_ptr = vectors + block_start_index;
        for (int c = 0; c < kBlockSize; c++) {
          dotprod += (*row_ptr++) * (*vector_block_ptr++);
        }  // for block
      }    // for num_nonzero_blocks
      result[batch * m_rows + row] += dotprod * batch_scaling_factor;
    }  // for row
  }    // for batch
}

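// Fully-quantized matrix * batch-vector product: accumulates int8 products and
// the bias in int32, rescales with (multiplier, shift), adds the output zero
// point and the existing output value, and saturates to the range of T
// (int8 or int16).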
template <typename T>
void PortableMatrixBatchVectorMultiplyAccumulateImpl(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    T* output) {
  const int16_t output_max = std::numeric_limits<T>::max();
  const int16_t output_min = std::numeric_limits<T>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int32_t acc = bias[row];
      for (int col = 0; col < n_input; ++col) {
        int8_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += input_val * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, multiplier, shift);
      acc += output_zp;
      acc += output[batch * n_output + row];
      if (acc > output_max) {
        acc = output_max;
      }
      if (acc < output_min) {
        acc = output_min;
      }
      output[batch * n_output + row] = static_cast<T>(acc);
    }
  }
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  PortableMatrixBatchVectorMultiplyAccumulateImpl(
      input, bias, input_to_gate_weights, multiplier, shift, n_batch, n_input,
      n_output, output_zp, output);
}

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp) {
  const int32_t int8_max = std::numeric_limits<int8_t>::max();
  const int32_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_cell; ++row) {
      int32_t acc = 0;
      for (int col = 0; col < n_input; ++col) {
        int32_t input_val = input[batch * n_input + col];
        int8_t weights_val = input_to_gate_weights[row * n_input + col];
        acc += (input_val - input_zeropoint) * weights_val;
      }
      acc = MultiplyByQuantizedMultiplier(acc, input_to_gate_effective_scale_a,
                                          input_to_gate_effective_scale_b);
      acc += gate_output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      gate_output[batch * n_cell + row] = static_cast<int8_t>(acc);
    }
  }
}

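// Multiplies the int16 hidden state by the int8 projection weights,
// accumulating in int64 with explicit saturation to the int32 range, then
// rescales, adds the output zero point, and clamps to int8.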
void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output) {
  const int16_t int8_max = std::numeric_limits<int8_t>::max();
  const int16_t int8_min = std::numeric_limits<int8_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int row = 0; row < n_output; ++row) {
      int64_t acc = gate_bias[row];
      for (int col = 0; col < n_hidden; ++col) {
        int16_t input_val = hidden[batch * n_hidden + col];
        int8_t weights_val = hidden_to_output_weights[row * n_hidden + col];
        int64_t curr = acc;
        acc += input_val * weights_val;
        if (input_val * weights_val > 0 && acc < curr) {
          acc = std::numeric_limits<int32_t>::max();
        }
        if (input_val * weights_val < 0 && acc > curr) {
          acc = std::numeric_limits<int32_t>::min();
        }
      }
      acc = MultiplyByQuantizedMultiplier(acc, proj_effective_scale_a,
                                          proj_effective_scale_b);
      acc += output_zp;
      if (acc > int8_max) {
        acc = int8_max;
      }
      if (acc < int8_min) {
        acc = int8_min;
      }
      proj_output[batch * n_output + row] = acc;
    }
  }
}

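// Integer layer normalization. The mean and centered values carry an extra
// 2^10 scale factor (hence the 2^20 used for the variance) so the normalized
// values retain enough resolution before being rescaled back to int16.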
void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output) {
  // The square of std::pow(2, 10), which is the extra factor that makes sure
  // normalized values have enough resolution.
  static const int kTwoToPower20 = 1 << 20;
  for (int i = 0; i < n_batch; ++i) {
    int64_t sum = 0;
    int64_t sum_sq = 0;
    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      sum += val;
      sum_sq += val * val;
    }
    int32_t mean =
        static_cast<int32_t>(static_cast<int64_t>(sum) * 1024 / n_input);
    // TODO(b/173994730): Avoids overflow but only works for POT n_input.
    int32_t temp = kTwoToPower20 / n_input;
    int64_t variance =
        sum_sq * temp - static_cast<int64_t>(mean) * static_cast<int64_t>(mean);
    int32_t variance2 = static_cast<int32_t>(variance / kTwoToPower20);
    if (variance2 < 1) {
      variance2 = variance_limit;
    }
    int32_t stddev_inverse_a;
    int stddev_inverse_b;
    GetInvSqrtQuantizedMultiplierExp(variance2, /*reverse_shift*/ -1,
                                     &stddev_inverse_a, &stddev_inverse_b);

    for (int j = 0; j < n_input; ++j) {
      const int32_t index = i * n_input + j;
      int32_t val = static_cast<int32_t>(input[index]);
      int32_t shifted = 1024 * val - mean;
      int32_t rescaled = MultiplyByQuantizedMultiplier(
          shifted, stddev_inverse_a, stddev_inverse_b);
      // TODO(jianlijianli): Saturate this.
      int64_t val3 = rescaled * layer_norm_weights[j] + bias[j];
      int32_t val4 =
          static_cast<int32_t>((val3 > 0 ? val3 + 512 : val3 - 512) / 1024);
      int32_t val5 = MultiplyByQuantizedMultiplier(val4, layer_norm_scale_a,
                                                   layer_norm_scale_b + 12);
      val5 = std::min(std::max(kInt16Min, val5), kInt16Max);
      output[index] = static_cast<int16_t>(val5);
    }
  }
}

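// Float reference implementation of the layer norm above; the weighted,
// normalized result is scaled by 2^12, rounded, and clamped to the int16
// range.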
void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  // This is to suppress a lint warning.
  const double two = 2.0;
  const float layer_norm_scale =
      layer_norm_scale_a *
      std::pow(two, static_cast<double>(layer_norm_scale_b - 31));
  const float bias_scale = std::pow(two, -10) * layer_norm_scale;

  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    float sum_sq = 0.0f;
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float value = static_cast<float>(input[index]);
      sum += value;
      sum_sq += value * value;
    }
    const float mean = sum / n_input;
    float stddev_inv = 0.0f;
    const float variance = sum_sq / n_input - mean * mean;
    if (variance == 0) {
      stddev_inv = 1.0f / sqrt(1e-8);
    } else {
      stddev_inv = 1.0f / sqrt(variance);
    }
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float normalized_value =
          (static_cast<float>(input[index]) - mean) * stddev_inv;
      const float weighted_normalized_value =
          normalized_value * layer_norm_weights[i] * layer_norm_scale +
          bias[i] * bias_scale;
      const int32_t quant_output = static_cast<int32_t>(
          std::round(weighted_normalized_value * std::pow(2, 12)));
      output[index] = std::min(int16_max, std::max(int16_min, quant_output));
    }
  }
}

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output) {
  for (int i = 0; i < n_row; ++i) {
    int32_t row_sum = 0;
    for (int j = 0; j < n_col; ++j) {
      row_sum += *matrix++;
    }
    output[i] += row_sum * scalar;
  }
}

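// Applies the logistic function to Q3.12 fixed-point input and writes Q0.15
// output, using gemmlowp's fixed-point arithmetic.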
void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int c = 0; c < n_input; c++) {
      using F3 = gemmlowp::FixedPoint<std::int16_t, 3>;
      using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
      const int index = batch * n_input + c;
      F3 sigmoid_input = F3::FromRaw(input[index]);
      F0 sigmoid_output = gemmlowp::logistic(sigmoid_input);
      output[index] = sigmoid_output.raw();
    }
  }
}

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input = input[index] * std::pow(2, -12);
      const float float_output = 1.0f / (1.0f + std::exp(-float_input));
      const int32_t quant_output =
          static_cast<int32_t>(float_output * std::pow(2, 15));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

template <int IntegerBits>
void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch,
                           int32_t n_input, int16_t* output) {
  using FX = gemmlowp::FixedPoint<std::int16_t, IntegerBits>;
  using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      FX tanh_input = FX::FromRaw(input[index]);
      F0 tanh_output = gemmlowp::tanh(tanh_input);
      output[index] = tanh_output.raw();
    }
  }
}

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output) {
  assert(integer_bits <= 6);
#define DISPATCH_TANH(i)                                       \
  case i:                                                      \
    PortableApplyTanhImpl<i>(input, n_batch, n_input, output); \
    break;
  switch (integer_bits) {
    DISPATCH_TANH(0);
    DISPATCH_TANH(1);
    DISPATCH_TANH(2);
    DISPATCH_TANH(3);
    DISPATCH_TANH(4);
    DISPATCH_TANH(5);
    DISPATCH_TANH(6);
    default:
      return;
  }
#undef DISPATCH_TANH
}

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  const double two = 2.0;
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const float float_input =
          input[index] * std::pow(two, static_cast<double>(integer_bits));
      const float float_output = std::tanh(float_input);
      const int32_t quant_output =
          static_cast<int32_t>(float_output * std::pow(2, 15));
      const int32_t quant_output_clamped =
          std::min(int16_max, std::max(int16_min, quant_output));
      output[index] = static_cast<int16_t>(quant_output_clamped);
    }
  }
}

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      const int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      output[index] =
          static_cast<int16_t>(gemmlowp::RoundingDivideByPOT(value, shift));
    }
  }
}

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      const int16_t a = input_1[index];
      const int16_t b = input_2[index];
      int32_t value = static_cast<int32_t>(a) * static_cast<int32_t>(b);
      value = MultiplyByQuantizedMultiplier(value, multiplier, shift);
      value -= output_zp;
      value = std::min(std::max(static_cast<int32_t>(-128), value),
                       static_cast<int32_t>(127));

      output[index] = static_cast<int8_t>(value);
    }
  }
}

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output) {
  for (int batch = 0; batch < n_batch; ++batch) {
    for (int i = 0; i < n_input; ++i) {
      const int index = batch * n_input + i;
      int32_t sum = input_1[index] + input_2[index];
      const int32_t sum_clamped = std::min(kInt16Max, std::max(kInt16Min, sum));
      output[index] = static_cast<int16_t>(sum_clamped);
    }
  }
}

float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size) {
  float result = 0.0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}

namespace {
inline int32_t VectorVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size) {
  int32_t result = 0;
  for (int v = 0; v < v_size; v++) {
    result += *vector1++ * *vector2++;
  }
  return result;
}
}  // namespace

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result) {
  for (int b = 0; b < n_batch; b++) {
    result[b] = VectorVectorDotProduct(vector1, vector2, v_size);
    vector1 += v_size;
    vector2 += v_size;
  }
}

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result) {
  for (int b = 0; b < n_batch; b++) {
    for (int v = 0; v < v_size; v++) {
      int32_t prod = vector[v] * *batch_vector++;
      prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift);
      int32_t output = prod + *result;
      output = std::max(std::min(static_cast<int32_t>(32767), output),
                        static_cast<int32_t>(-32768));
      *result++ = output;
    }
  }
}

void PortableSub1Vector(const float* vector, int v_size, float* result) {
  for (int v = 0; v < v_size; v++) {
    *result++ = 1.0f - *vector++;
  }
}

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  static const int16_t kOne = 32767;
  for (int v = 0; v < v_size; v++) {
    *result++ = kOne - *vector++;
  }
}

void PortableVectorScalarMultiply(const int8_t* vector, const int v_size,
                                  const float scale, float* result) {
  for (int v = 0; v < v_size; ++v) {
    *result++ = scale * *vector++;
  }
}

void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch) {
  for (int batch = 0; batch < n_batch; ++batch) {
    float sum = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      sum += input_vector[i];
    }
    const float mean = sum / v_size;
    float sum_diff_sq = 0.0f;
    for (int i = 0; i < v_size; ++i) {
      const float diff = input_vector[i] - mean;
      sum_diff_sq += diff * diff;
    }
    const float variance = sum_diff_sq / v_size;
    constexpr float kNormalizationConstant = 1e-8f;
    const float stddev_inv =
        1.0f / std::sqrt(variance + kNormalizationConstant);
    for (int i = 0; i < v_size; ++i) {
      output_vector[i] = (input_vector[i] - mean) * stddev_inv;
    }
    input_vector += v_size;
    output_vector += v_size;
  }
}

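// Rescales the zero-point-adjusted input and recurrent contributions by their
// effective scales, adds them, and saturates the sum to int16.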
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output) {
  const int32_t int16_max = std::numeric_limits<int16_t>::max();
  const int32_t int16_min = std::numeric_limits<int16_t>::min();
  for (int i = 0; i < n_batch * n_cell; ++i) {
    int32_t x = static_cast<int32_t>(input[i]) - static_cast<int32_t>(input_zp);
    int32_t h =
        static_cast<int32_t>(recurrent[i]) - static_cast<int32_t>(recurrent_zp);
    int32_t x_scaled = MultiplyByQuantizedMultiplier(x, input_effective_scale_a,
                                                     input_effective_scale_b);
    int32_t h_scaled = MultiplyByQuantizedMultiplier(
        h, recurrent_effective_scale_a, recurrent_effective_scale_b);
    int32_t y = h_scaled + x_scaled;
    if (y > int16_max) {
      y = int16_max;
    }
    if (y < int16_min) {
      y = int16_min;
    }
    output[i] = static_cast<int16_t>(y);
  }
}

}  // namespace tensor_utils
}  // namespace tflite