/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h"
#include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h"

namespace tflite {
namespace tensor_utils {

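// Each wrapper below dispatches, via the NEON_OR_PORTABLE macro, to the
// NEON-optimized kernel when NEON is available and to the portable reference
// kernel otherwise. Wrappers that call a Portable* function directly are
// currently backed by the reference implementation only.
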
void MatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                         int m_cols, const float* vector,
                                         int n_batch, float* result) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vector, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                         const int m_rows, const int m_cols,
                                         const int8_t* __restrict__ vectors,
                                         const float* scaling_factors,
                                         int n_batch,
                                         float* __restrict__ result) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vectors, scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                         const int m_rows, const int m_cols,
                                         const int8_t* __restrict__ vectors,
                                         const float* scaling_factors,
                                         int n_batch, int32_t* scratch,
                                         float* __restrict__ result,
                                         CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vectors, scaling_factors, n_batch, scratch, result, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols,
                   vectors, scaling_factors, n_batch, result, per_channel_scale,
                   input_offset, scratch, row_sums, compute_row_sums, context);
}

void SparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate1x4, matrix,
                   segments, indices, m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vector, n_batch, result);
}

void SparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result) {
  NEON_OR_PORTABLE(SparseMatrixBatchVectorMultiplyAccumulate, matrix, ledger,
                   m_rows, m_cols, vectors, scaling_factors, n_batch, result);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, input, bias,
                   input_to_gate_weights, multiplier, shift, n_batch, n_input,
                   n_output, output_zp, scratch, output, context);
}

void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context) {
  NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, input, bias,
                   input_to_gate_weights, multiplier, shift, n_batch, n_input,
                   n_output, output_zp, scratch, output, context);
}

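// No NEON dispatch for this overload; it always runs the portable kernel.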
void MatrixBatchVectorMultiply(const int8_t* input, int32_t input_zeropoint,
                               const int8_t* input_to_gate_weights,
                               int32_t input_to_gate_effective_scale_a,
                               int32_t input_to_gate_effective_scale_b,
                               int32_t n_batch, int32_t n_input, int32_t n_cell,
                               int8_t* gate_output, int8_t gate_output_zp) {
  PortableMatrixBatchVectorMultiply(
      input, input_zeropoint, input_to_gate_weights,
      input_to_gate_effective_scale_a, input_to_gate_effective_scale_b, n_batch,
      n_input, n_cell, gate_output, gate_output_zp);
}

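// As above, this overload always runs the portable kernel.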
void MatrixBatchVectorMultiply(const int16_t* hidden,
                               const int8_t* hidden_to_output_weights,
                               int32_t proj_effective_scale_a,
                               int32_t proj_effective_scale_b,
                               const int32_t* gate_bias, int32_t n_batch,
                               int32_t n_hidden, int32_t n_output,
                               int32_t output_zp, int8_t* proj_output) {
  PortableMatrixBatchVectorMultiply(hidden, hidden_to_output_weights,
                                    proj_effective_scale_a,
                                    proj_effective_scale_b, gate_bias, n_batch,
                                    n_hidden, n_output, output_zp, proj_output);
}

void MatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                    int32_t n_row, int32_t n_col,
                                    int32_t* output) {
  NEON_OR_PORTABLE(MatrixScalarMultiplyAccumulate, matrix, scalar, n_row, n_col,
                   output);
}

void ApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                    const int32_t* bias, int32_t layer_norm_scale_a,
                    int32_t layer_norm_scale_b, int32_t variance_limit,
                    int n_batch, int n_input, int16_t* output) {
  NEON_OR_PORTABLE(ApplyLayerNorm, input, layer_norm_weights, bias,
                   layer_norm_scale_a, layer_norm_scale_b, variance_limit,
                   n_batch, n_input, output);
}

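// ApplyLayerNormFloat has no NEON wrapper in this file; it calls the portable
// version.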
void ApplyLayerNormFloat(const int16_t* input,
                         const int16_t* layer_norm_weights,
                         int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                         const int32_t* bias, int n_batch, int n_input,
                         int16_t* output) {
  PortableApplyLayerNormFloat(input, layer_norm_weights, layer_norm_scale_a,
                              layer_norm_scale_b, bias, n_batch, n_input,
                              output);
}

void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                  int16_t* output) {
  NEON_OR_PORTABLE(ApplySigmoid, input, n_batch, n_input, output);
}

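// Likewise portable-only.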
void ApplySigmoidFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                       int16_t* output) {
  PortableApplySigmoidFloat(input, n_batch, n_input, output);
}

void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
               int32_t n_input, int16_t* output) {
  NEON_OR_PORTABLE(ApplyTanh, integer_bits, input, n_batch, n_input, output);
}

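// Likewise portable-only.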
void ApplyTanhFloat(const int16_t* input, int32_t n_batch, int32_t n_input,
                    int32_t integer_bits, int16_t* output) {
  PortableApplyTanhFloat(input, n_batch, n_input, integer_bits, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int shift, int16_t* output) {
  NEON_OR_PORTABLE(CwiseMul, input_1, input_2, n_batch, n_input, shift, output);
}

void CwiseMul(const int16_t* input_1, const int16_t* input_2,
              int32_t multiplier, int shift, int n_batch, int n_input,
              int32_t output_zp, int8_t* output) {
  NEON_OR_PORTABLE(CwiseMul, input_1, input_2, multiplier, shift, n_batch,
                   n_input, output_zp, output);
}

void CwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
              int n_input, int16_t* output) {
  NEON_OR_PORTABLE(CwiseAdd, input_1, input_2, n_batch, n_input, output);
}

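// The CwiseClipping overloads clamp each element of `vector` in place to
// [-clipping_value, clipping_value].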
void CwiseClipping(float* vector, const int v_size,
                   const float clipping_value) {
  NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
}
void CwiseClipping(int16_t* vector, const int v_size,
                   const int16_t clipping_value) {
  NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
}
void CwiseClipping(int8_t* vector, const int v_size,
                   const int8_t clipping_value) {
  NEON_OR_PORTABLE(CwiseClipping, vector, v_size, clipping_value);
}

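// This dot product is handled by the portable version.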
void BatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                      const int16_t* vector2, int v_size,
                                      int n_batch, int32_t* result) {
  PortableBatchVectorBatchVectorDotProduct(vector1, vector2, v_size, n_batch,
                                           result);
}

void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size,
                                             const int16_t* batch_vector,
                                             int n_batch, int32_t multiplier,
                                             int shift, int16_t* result) {
  NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size,
                   batch_vector, n_batch, multiplier, shift, result);
}

float VectorVectorDotProduct(const float* vector1, const float* vector2,
                             int v_size) {
  return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size);
}

void Sub1Vector(const float* vector, int v_size, float* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

void Sub1Vector(const int16_t* vector, int v_size, int16_t* result) {
  NEON_OR_PORTABLE(Sub1Vector, vector, v_size, result);
}

// Check if all entries of a vector are zero for float.
bool IsZeroVector(const float* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

// Check if all entries of a vector are zero for int8.
bool IsZeroVector(const int8_t* vector, int v_size) {
  return NEON_OR_PORTABLE(IsZeroVector, vector, v_size);
}

void VectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                          float* result) {
  NEON_OR_PORTABLE(VectorScalarMultiply, vector, v_size, scale, result);
}

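// Symmetric int8 quantization. The first overload also reports the min/max
// observed in `values`; the second takes a precomputed range.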
void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float* min_value,
                             float* max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void SymmetricQuantizeFloats(const float* values, const int size,
                             int8_t* quantized_values, float min_value,
                             float max_value, float* scaling_factor) {
  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
                   min_value, max_value, scaling_factor);
}

void AsymmetricQuantizeFloats(const float* values, const int size,
                              int8_t* quantized_values, float* scaling_factor,
                              int32_t* offset) {
  NEON_OR_PORTABLE(AsymmetricQuantizeFloats, values, size, quantized_values,
                   scaling_factor, offset);
}

void ReductionSumVector(const float* input_vector, float* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

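// The int32 input variant is handled by the portable version.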
void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  PortableReductionSumVector(input_vector, output_vector, output_size,
                             reduction_size);
}

void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                        int output_size, int reduction_size) {
  NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size,
                   reduction_size);
}

void MeanStddevNormalization(const float* __restrict__ input_vector,
                             float* __restrict__ output_vector, int v_size,
                             int n_batch) {
  NEON_OR_PORTABLE(MeanStddevNormalization, input_vector, output_vector, v_size,
                   n_batch);
}

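// TwoGateSaturatingAdd is handled by the portable version.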
void TwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                          const int8_t* recurrent, int8_t recurrent_zp,
                          int32_t input_effective_scale_a,
                          int32_t input_effective_scale_b,
                          int32_t recurrent_effective_scale_a,
                          int32_t recurrent_effective_scale_b, int32_t n_batch,
                          int32_t n_cell, int16_t* output) {
  PortableTwoGateSaturatingAdd(
      input, input_zp, recurrent, recurrent_zp, input_effective_scale_a,
      input_effective_scale_b, recurrent_effective_scale_a,
      recurrent_effective_scale_b, n_batch, n_cell, output);
}

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_H_