/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {

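// Each wrapper below dispatches through the NEON_OR_PORTABLE macro, which
// selects the NEON-optimized implementation when NEON is usable on the target
// and otherwise falls back to the portable reference implementation. Wrappers
// that call a Portable* function directly have no NEON specialization.

// Multiplies the row-major float matrix (m_rows x m_cols) by each of the
// n_batch input vectors and accumulates the products into result.
// Illustrative usage (a sketch with hypothetical shapes and values):
//
//   float matrix[6] = {1, 2, 3, 4, 5, 6};   // 2 x 3 weights
//   float vectors[6] = {1, 0, 1, 0, 1, 0};  // 2 batches of 3 inputs
//   float result[4] = {};                   // 2 batches of 2 outputs
//   tflite::tensor_utils::MatrixBatchVectorMultiplyAccumulate(
//       matrix, /*m_rows=*/2, /*m_cols=*/3, vectors, /*n_batch=*/2, result);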
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vector, n_batch, result);
}

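// Hybrid overloads: the matrix and input vectors are symmetrically quantized
// to int8, and scaling_factors holds one dequantization scale per batch. The
// later variants also take a scratch buffer and a CpuBackendContext, which
// optimized implementations may use, and per-channel scales with input
// offsets plus cached row sums for asymmetrically quantized inputs.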
void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                         const int m_rows, const int m_cols,
                                         const int8_t* __restrict__ vectors,
                                         const float* scaling_factors,
                                         int n_batch,
                                         float* __restrict__ result) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vectors, scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                         const int m_rows, const int m_cols,
                                         const int8_t* __restrict__ vectors,
                                         const float* scaling_factors,
                                         int n_batch, int32_t* scratch,
                                         float* __restrict__ result,
                                         CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vectors, scaling_factors, n_batch, scratch, result, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vectors, scaling_factors, n_batch, result, per_channel_scale,
                   input_offset, scratch, row_sums, compute_row_sums, context);
}

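// Sparse variants. The 1x4 form stores the float matrix in a block
// compressed-sparse-row layout described by segments and indices; the ledger
// forms use a ledger that records which blocks of each row are non-zero, for
// float weights and for hybrid int8 weights respectively.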
void SparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate1x4, matrix,
                   segments, indices, m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vectors, scaling_factors, n_batch, result);
}

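// Fully integer overloads used by the integer LSTM kernels: the int8 input is
// multiplied by the int8 weights, bias is added, and the accumulator is
// rescaled with the quantized multiplier/shift pair and offset by output_zp
// before being written as int16 or int8 output.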
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, input, bias,
                   input_to_gate_weights, multiplier, shift, n_batch, n_input,
                   n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, input, bias,
                   input_to_gate_weights, multiplier, shift, n_batch, n_input,
                   n_output, output_zp, scratch, output, context);
}

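// Matrix-times-batched-vector products (no accumulation) for the quantized
// LSTM gate and projection computations. These have no NEON specialization
// and always delegate to the portable implementations.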
void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
                               const int8_t* input_to_gate_weights,
                               int32_t input_to_gate_effective_scale_a,
                               int32_t input_to_gate_effective_scale_b,
                               int32_t n_batch, int32_t n_input, int32_t n_cell,
                               int8_t* gate_output, int8_t gate_output_zp) {
  PortableMatrixBatchVectorMultiply(
      input, input_zeropoint, input_to_gate_weights,
      input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
      n_input, n_cell, gate_output, gate_output_zp);
}

void MatrixBatchVectorMultiply(const int16_t* hidden,
                               const int8_t* hidden_to_output_weights,
                               int32_t proj_effective_scale_a,
                               int32_t proj_effective_scale_b,
                               const int32_t* gate_bias, int32_t n_batch,
                               int32_t n_hidden, int32_t n_output,
                               int32_t output_zp, int8_t* proj_output) {
  PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights,
                                    proj_effective_scale_a,
                                    proj_effective_scale_b, gate_bias, n_batch,
                                    n_hidden, n_output, output_zp, proj_output);
}

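// For each of the n_row rows, accumulates scalar times the row's sum of
// matrix entries into output (typically used to fold a constant zero point
// into a matrix product).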
void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                    int32_t n_row, int32_t n_col,
                                    int32_t* output) {
  NEON_OR_PORTABLE(MatrixScalarMultiplyAccumulate, matrix, scalar, n_row, n_col,
                   output);
}

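// Integer layer normalization of int16 activations: each batch row is
// normalized, then rescaled by layer_norm_weights using the quantized scale
// (layer_norm_scale_a, layer_norm_scale_b) and bias. The Float variant does
// the normalization arithmetic in float internally and requantizes the
// result; it has no NEON specialization.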
void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                    const int32_t* bias, int32_t layer_norm_scale_a,
                    int32_t layer_norm_scale_b, int32_t variance_limit,
                    int n_batch, int n_input, int16_t* output) {
  NEON_OR_PORTABLE(ApplyLayerNorm, input, layer_norm_weights, bias,
                   layer_norm_scale_a, layer_norm_scale_b, variance_limit,
                   n_batch, n_input, output);
}

void ApplyLayerNormFloat(const int16_t* input,
                         const int16_t* layer_norm_weights,
                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                         const int32_t* bias, int n_batch, int n_input,
                         int16_t* output) {
  PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
                              layer_norm_scale_b, bias, n_batch, n_input,
                              output);
}

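// Fixed-point sigmoid and tanh on int16 activations; integer_bits gives the
// number of integer bits in the input's Q format. The Float variants compute
// the activation in float internally and requantize; they have no NEON
// specialization.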
void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                  int16_t* output) {
  NEON_OR_PORTABLE(ApplySigmoid, input, n_batch, n_input, output);
}

void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                       int16_t* output) {
  PortableApplySigmoidFloat(input, n_batch, n_input, output);
}

void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
               int32_t n_input, int16_t* output) {
  NEON_OR_PORTABLE(ApplyTanh, integer_bits, input, n_batch, n_input, output);
}

void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                    int32_t integer_bits, int16_t* output) {
  PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
}

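// Element-wise arithmetic on batched int16 vectors: CwiseMul multiplies two
// vectors (the second overload rescales the product with multiplier/shift and
// writes int8 with the given output zero point), CwiseAdd adds with
// saturation, and CwiseClipping clamps each element in place to
// [-clipping_value, clipping_value].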
void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int16_t* output) {
  NEON_OR_PORTABLE(CwiseMul, input_1, input_2, n_batch, n_input, shift, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2,
              int32_t multiplier, int shift, int n_batch, int n_input,
              int32_t output_zp, int8_t* output) {
  NEON_OR_PORTABLE(CwiseMul, input_1, input_2, multiplier, shift, n_batch,
                   n_input, output_zp, output);
}

void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int16_t* output) {
  NEON_OR_PORTABLE(CwiseAdd, input_1, input_2, n_batch, n_input, output);
}

void CwiseClipping(float* vector, const int v_size,
                   const float clipping_value) {
  NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
}

void CwiseClipping(int16_t* vector, const int v_size,
                   const int16_t clipping_value) {
  NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
}

void CwiseClipping(int8_t* vector, const int v_size,
                   const int8_t clipping_value) {
  NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
}

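// Batched dot products: BatchVectorBatchVectorDotProduct computes one int32
// dot product per batch (portable only), and
// VectorBatchVectorCwiseProductAccumulate accumulates the rescaled
// element-wise product of vector with each batch vector into result.
// VectorVectorDotProduct returns the dot product of two float vectors.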
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result) {
  PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
                                           result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
                                             int shift, int16_t* result) {
  NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
                   batch_vector, n_batch, multiplier, shift, result);
}

float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size) {
  return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
}

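// Computes result = 1 - vector element-wise; the int16 overload treats the
// values as Q0.15 fixed point.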
void Sub1Vector(const float* vector, int v_size, float* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

// Checks whether all entries of a float vector are zero.
bool IsZeroVector(const float* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

// Checks whether all entries of an int8 vector are zero.
bool IsZeroVector(const int8_t* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

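// Converts the int8 vector to float, scaling each element by scale.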
void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result) {
  NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result);
}

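// Quantization helpers. SymmetricQuantizeFloats maps floats to int8 over a
// symmetric range: the first overload also finds and returns the min/max of
// values, while the second takes a precomputed min/max.
// AsymmetricQuantizeFloats additionally computes an int32 offset (zero point)
// so the full float range can be represented.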
void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min_value,
                             float* max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float min_value,
                             float max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void AsymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* scaling_factor,
                              int32_t* offset) {
  NEON_OR_PORTABLE(AsymmetricQuantizeFloats, values, size, quantized_values,
                   scaling_factor, offset);
}

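// Reduces input_vector by summing consecutive groups of reduction_size
// elements into each of the output_size entries of output_vector. The int32
// overload has no NEON specialization.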
void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

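// Normalizes each of the n_batch rows of input_vector to zero mean and unit
// standard deviation, writing the result to output_vector.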
void MeanStddevNormalization(const float* __restrict__ input_vector,
                             float* __restrict__ output_vector, int v_size,
                             int n_batch) {
  NEON_OR_PORTABLE(MeanStddevNormalization, input_vector, output_vector, v_size,
                   n_batch);
}

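// Rescales the int8 input and recurrent gate activations with their
// effective (scale_a, scale_b) pairs and adds them with saturation into the
// int16 output; used by the integer LSTM gates. Portable only.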
void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                          const int8_t* recurrent, int8_t recurrent_zp,
                          int32_t input_effective_scale_a,
                          int32_t input_effective_scale_b,
                          int32_t recurrent_effective_scale_a,
                          int32_t recurrent_effective_scale_b, int32_t n_batch,
                          int32_t n_cell, int16_t* output) {
  PortableTwoGateSaturatingAdd(
      input, input_zp, recurrent, recurrent_zp, input_effective_scale_a,
      input_effective_scale_b, recurrent_effective_scale_a,
      recurrent_effective_scale_b, n_batch, n_cell, output);
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_