/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {
namespace tensor_utils {

#ifdef USE_NEON

// Multiplies the matrix (m_rows x m_cols) by a batch of vectors and
// accumulates the results into `result` (one m_rows-sized output vector per
// batch).
void NeonMatrixBatchVectorMultiplyAccumulate(const float* matrix, int m_rows,
                                             int m_cols, const float* vector,
                                             int n_batch, float* result);

// Matrix multiplication for quantized values using symmetric quantization.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result);

// Same as above, but with a scratch buffer and a CpuBackendContext for the
// int8 x int8 -> int32 accumulation computation.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, int32_t* scratch, float* __restrict__ result,
    CpuBackendContext* context);
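
// For orientation, the float overload of NeonMatrixBatchVectorMultiplyAccumulate
// above is equivalent to the following scalar loop (a sketch of the semantics
// only, not the NEON implementation; the quantized overloads additionally scale
// each batch's int8 dot product by scaling_factors[batch]):
//
//   for (int b = 0; b < n_batch; ++b)
//     for (int r = 0; r < m_rows; ++r)
//       for (int c = 0; c < m_cols; ++c)
//         result[b * m_rows + r] +=
//             matrix[r * m_cols + c] * vector[b * m_cols + c];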

// Matrix multiplication for quantized values using asymmetric quantization.
void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);

void NeonApplyLayerNorm(const int16_t* input,
                        const int16_t* layer_norm_weights, const int32_t* bias,
                        int32_t layer_norm_scale_a, int32_t layer_norm_scale_b,
                        int32_t variance_limit, int n_batch, int n_input,
                        int16_t* output);

void NeonApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input,
                      int16_t* output);

void NeonApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch,
                   int32_t n_input, int16_t* output);

void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch,
                  int n_input, int shift, int16_t* output);

void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2,
                  int32_t multiplier, int shift, int n_batch, int n_input,
                  int32_t output_zp, int8_t* output);

void NeonCwiseAdd(const int16_t* input_1, const int16_t* input_2, int n_batch,
                  int n_input, int16_t* output);

void NeonCwiseClipping(float* vector, const int v_size,
                       const float clipping_value);
void NeonCwiseClipping(int16_t* vector, const int v_size,
                       const int16_t clipping_value);
void NeonCwiseClipping(int8_t* vector, const int v_size,
                       const int8_t clipping_value);

void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

void NeonMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);

void NeonMatrixScalarMultiplyAccumulate(const int8_t* matrix, int32_t scalar,
                                        int32_t n_row, int32_t n_col,
                                        int32_t* output);

void NeonSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

// Multiplies a sparse matrix by a batch of vectors and accumulates the
// results into `result` (one m_rows-sized output vector per batch).
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);
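
// Note on the ledger-based sparse layout (described here for orientation,
// following the portable reference implementation; not a contract of this
// header): the matrix stores only its non-zero blocks contiguously, and
// `ledger` holds, for each row, the number of non-zero blocks followed by the
// column-block index of each such block. In outline:
//
//   for each row r:
//     num_blocks = *ledger++;
//     for each of the num_blocks entries:
//       block_col = *ledger++;  // dot this block against the vector slice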

// Multiplies a symmetric quantized matrix by a quantized batch vector. The
// matrix is stored in sparse format.
void NeonSparseMatrixBatchVectorMultiplyAccumulate1x16(
    const int8_t* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const int8_t* __restrict__ vector, const int32_t* __restrict__ bias_vector,
    int n_batch, const int32_t input_offset, const int32_t output_multiplier,
    const int32_t output_shift, const int32_t output_offset,
    const int32_t output_activation_min, const int32_t output_activation_max,
    int8_t* __restrict__ result);

// Matrix multiplication for quantized values using symmetric quantization.
// Sparse version.
void NeonSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result);

// Dot product of two vectors.
float NeonVectorVectorDotProduct(const float* vector1, const float* vector2,
                                 int v_size);

// Computes "1.0f - elements of vector" (used in CIFG).
void NeonSub1Vector(const float* vector, int v_size, float* result);

void NeonSub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiplies all elements of vector by a scalar.
void NeonVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                              float* result);

// Checks if all entries of a vector are zero.
bool NeonIsZeroVector(const float* vector, int v_size);

// Checks if all entries of a vector are zero.
bool NeonIsZeroVector(const int8_t* vector, int v_size);

// Symmetric quantizer.
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float* min,
                                 float* max, float* scaling_factor);

// Symmetric quantizer with pre-computed min and max values.
void NeonSymmetricQuantizeFloats(const float* values, const int size,
                                 int8_t* quantized_values, float min, float max,
                                 float* scaling_factor);

// Asymmetric quantizer.
void NeonAsymmetricQuantizeFloats(const float* values, const int size,
                                  int8_t* quantized_values,
                                  float* scaling_factor, int32_t* offset);

// Reduce-sum on a float input vector:
// input_vector: float pointer to the input vector.
// output_vector: float pointer to the output vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from the input vector that
// are added together to produce one element of the output.
void NeonReductionSumVector(const float* input_vector, float* output_vector,
                            int output_size, int reduction_size);

void NeonReductionSumVector(const int8_t* input_vector, int32_t* output_vector,
                            int output_size, int reduction_size);

void NeonVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector,
    int n_batch, int32_t multiplier, int shift, int16_t* result);

// Layer norm for each batch.
void NeonMeanStddevNormalization(const float* __restrict__ input_vector,
                                 float* __restrict__ output_vector, int v_size,
                                 int n_batch);

#endif  // USE_NEON

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_NEON_TENSOR_UTILS_IMPL_H_