/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

#ifdef USE_NEON

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result);
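//
// For reference, the computation is equivalent to the scalar loop below (an
// illustrative sketch of the semantics, not the NEON implementation; note
// that the accumulation is into `result`, which must be pre-initialized):
//
//   for (int b = 0; b < n_batch; ++b) {
//     for (int r = 0; r < m_rows; ++r) {
//       for (int c = 0; c < m_cols; ++c) {
//         result[b * m_rows + r] +=
//             matrix[r * m_cols + c] * vector[b * m_cols + c];
//       }
//     }
//   }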

// Matrix multiplication for quantized values using symmetric quantization.
void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                             const int m_rows, const int m_cols,
                                             const int8_t* __restrict__ vectors,
                                             const float* scaling_factors,
                                             int n_batch,
                                             float* __restrict__ result);

// Same as above, but with a scratch buffer and CpuBackendContext for the
// int8 x int8 -> int32 accumulation computation.
void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix,
                                             const int m_rows, const int m_cols,
                                             const int8_t* __restrict__ vectors,
                                             const float* scaling_factors,
                                             int n_batch, int32_t* scratch,
                                             float* __restrict__ result,
                                             CpuBackendContext* context);

// Matrix multiplication for quantized values using asymmetric quantization.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);

// Layer normalization of each batch of quantized int16 input, with the scale
// given as the quantized multiplier pair (layer_norm_scale_a,
// layer_norm_scale_b).
void NeonApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights,
                        const int32_t* bias, int32_t layer_norm_scale_a,
                        int32_t layer_norm_scale_b, int32_t variance_limit,
                        int n_batch, int n_input, int16_t* output);

// Element-wise sigmoid on a batch of quantized int16 input.
void NeonApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                      int16_t* output);

// Element-wise tanh on a batch of quantized int16 input; integer_bits is the
// number of integer bits in the input's fixed-point format.
void NeonApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
                   int32_t n_input, int16_t* output);

// Element-wise multiplication of two int16 batch vectors, with the int32
// product rounding-right-shifted by `shift` before being written as int16.
void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
                  int n_input, int shift, int16_t* output);

// Same as above, but the product is rescaled by the quantized multiplier
// (multiplier, shift) and written as int8 with zero point output_zp.
void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2,
                  int32_t multiplier, int shift, int n_batch, int n_input,
                  int32_t output_zp, int8_t* output);

// Element-wise saturating addition of two int16 batch vectors.
void NeonCwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
                  int n_input, int16_t* output);

// Clip each element of the vector to [-clipping_value, clipping_value],
// in place.
void NeonCwiseClipping(float* vector, const int v_size,
                       const float clipping_value);
void NeonCwiseClipping(int16_t* vector, const int v_size,
                       const int16_t clipping_value);
void NeonCwiseClipping(int8_t* vector, const int v_size,
                       const int8_t clipping_value);

// Matrix multiplication of quantized (int8) input against
// input_to_gate_weights, with int32 bias added; the int32 accumulator is
// rescaled by the quantized multiplier (multiplier, shift) and written as
// int8 with zero point output_zp.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

// Same as above, but producing int16 output.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);

// Multiply each entry of an int8 matrix by a scalar, accumulating the results
// into the int32 output matrix.
void NeonMatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                        int32_t n_row, int32_t n_col,
                                        int32_t* output);

// Multiply a sparse matrix stored in a 1x4-block format (described by
// `segments` and `indices`) by a batch vector, accumulating the results into
// a batch-size vector.
void NeonSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector. Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result);

// Dot product of two vectors.
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);

// Compute "1.0f - elements of vector" (used in CIFG).
void NeonSub1Vector(const float* vector, int v_size, float* result);

// Same as above, for int16 values in Q0.15 format.
void NeonSub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiply each element of an int8 vector by a float scale, producing a
// float result.
void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                              float* result);

// Check if all entries of a vector are zero.
bool NeonIsZeroVector(const float* vector, int v_size);

// Check if all entries of a vector are zero.
bool NeonIsZeroVector(const int8_t* vector, int v_size);

// Symmetric quantizer; also computes the min, max, and scaling factor of the
// input values.
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float* min,
                                 float* max, float* scaling_factor);
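//
// Roughly, the symmetric scheme maps the largest absolute input value onto
// 127 (an illustrative sketch of the semantics, not the NEON code):
//
//   range = std::max(std::abs(*min), std::abs(*max));
//   *scaling_factor = range / 127;
//   quantized_values[i] = std::min(127, std::max(-128,
//       static_cast<int32_t>(std::round(values[i] / *scaling_factor))));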

// Symmetric quantizer that takes the precomputed min and max of the values.
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float min, float max,
                                 float* scaling_factor);

// Asymmetric quantizer.
void NeonAsymmetricQuantizeFloats(const float* values, const int size,
                                  int8_t* quantized_values,
                                  float* scaling_factor, int32_t* offset);
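//
// Roughly, the asymmetric scheme maps the observed value range (extended to
// include zero) onto [-128, 127], with `offset` as the zero point (an
// illustrative sketch of the semantics, not the NEON code):
//
//   *scaling_factor = (rmax - rmin) / 255;  // with rmin <= 0 <= rmax
//   *offset = zero point chosen so that 0.0f maps to an exact integer;
//   quantized_values[i] = std::min(127, std::max(-128,
//       static_cast<int32_t>(std::round(values[i] / *scaling_factor)) +
//       *offset));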

// Reduce-sum on a float input vector:
// input_vector: float pointer to input vector.
// output_vector: float pointer to output vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size);
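//
// For reference, the semantics are (an illustrative sketch, not the NEON
// implementation):
//
//   for (int o = 0; o < output_size; ++o) {
//     float sum = 0.0f;
//     for (int r = 0; r < reduction_size; ++r) {
//       sum += input_vector[o * reduction_size + r];
//     }
//     output_vector[o] = sum;
//   }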

// Same as above, but summing int8 inputs into int32 outputs.
void NeonReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                            int output_size, int reduction_size);

// Element-wise product of `vector` with each batch of `batch_vector`,
// rescaled by the quantized multiplier (multiplier, shift) and accumulated
// into `result`.
void NeonVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result);

// Layer norm for each batch: normalizes each batch of v_size elements to
// zero mean and unit variance.
void NeonMeanStddevNormalization(const float* __restrict__ input_vector,
                                 float* __restrict__ output_vector, int v_size,
                                 int n_batch);
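//
// For reference (an illustrative sketch; the epsilon guarding the division
// is an assumption, the actual constant lives in the implementation):
//
//   for (int b = 0; b < n_batch; ++b) {
//     const float* in = input_vector + b * v_size;
//     float* out = output_vector + b * v_size;
//     // mean and (population) variance over in[0..v_size)
//     for (int i = 0; i < v_size; ++i) {
//       out[i] = (in[i] - mean) / std::sqrt(variance + kEpsilon);
//     }
//   }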

#endif  // USE_NEON

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_