/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

#include <algorithm>
#include <cstdint>

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {

// Not all backends support CpuBackendContext usage, so forward declare to
// avoid pulling in its implementation.
class CpuBackendContext;

namespace tensor_utils {

template <typename T>
bool PortableIsZeroVector(const T* vector, int v_size) {
  for (int i = 0; i < v_size; ++i) {
    if (vector[i] != 0) {
      return false;
    }
  }
  return true;
}

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor);

void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset);
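
// Example usage of the quantization helpers above. This is a hedged sketch of
// the assumed behavior (the exact rounding/clamping lives in the .cc file):
// symmetric quantization conventionally chooses scaling_factor so that the
// largest-magnitude input maps near +/-127, i.e. q[i] ~= round(v[i] / scale).
//
//   float values[4] = {-0.5f, 0.25f, 1.0f, -2.0f};
//   int8_t quantized[4];
//   float min_v, max_v, scale;
//   PortableSymmetricQuantizeFloats(values, 4, quantized, &min_v, &max_v,
//                                   &scale);
//   // Roughly: scale ~= 2.0f / 127, quantized ~= {-32, 16, 64, -127}.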

// Multiplies the matrix by each of the n_batch input vectors and accumulates
// the results into `result`, which holds m_rows values per batch.
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result);
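
// A hedged sketch of the assumed semantics of the float overload above, with a
// row-major matrix of shape m_rows x m_cols, n_batch input vectors of length
// m_cols each, and a result buffer of n_batch * m_rows values that is
// accumulated into (not overwritten):
//
//   for (int b = 0; b < n_batch; ++b)
//     for (int r = 0; r < m_rows; ++r)
//       for (int c = 0; c < m_cols; ++c)
//         result[b * m_rows + r] +=
//             matrix[r * m_cols + c] * vector[b * m_cols + c];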

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vector, const float* scaling_factors,
    int n_batch, int32_t* scratch, float* __restrict__ result,
    CpuBackendContext* context);

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result);

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result);
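
// Note on the (multiplier, shift) and *_effective_scale_a / *_effective_scale_b
// parameters taken by the quantized routines in this header: by common TFLite
// convention (an assumption here, not something this header spells out), they
// encode a real-valued rescaling factor as a Q31 fixed-point multiplier plus a
// power-of-two shift, roughly real_scale ~= multiplier * 2^(shift - 31).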

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp);

void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output);

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output);

void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output);
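
// The quantized layer norm above presumably normalizes each of the n_batch
// rows of `input` (n_input values each) to zero mean and unit variance and
// then applies `layer_norm_weights` and `bias`, with the output rescaled via
// the (layer_norm_scale_a, layer_norm_scale_b) multiplier/shift pair; see the
// .cc implementation for the exact fixed-point details.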

void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output);

void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output);

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output);

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output);

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output);

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output);

template <typename T>
void PortableCwiseClipping(T* vector, const int v_size,
                           const T& clipping_value) {
  for (int i = 0; i < v_size; i++) {
    vector[i] = std::max(std::min(clipping_value, vector[i]),
                         static_cast<T>(-clipping_value));
  }
}
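
// Example usage of PortableCwiseClipping, which clamps every element to the
// range [-clipping_value, clipping_value] in place:
//
//   float gate[3] = {-5.0f, 0.5f, 7.0f};
//   PortableCwiseClipping(gate, 3, 3.0f);
//   // gate is now {-3.0f, 0.5f, 3.0f}.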

// Batch vector initialization with another vector: `vector` (length v_size) is
// copied into each of the n_batch rows of `batch_vector`.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Compute "1.0f - elements of vector" (used in CIFG).
void PortableSub1Vector(const float* vector, int v_size, float* result);

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiply all elements of vector by a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);

// Reduce-sum on a vector:
// input_vector: pointer to the input vector.
// output_vector: pointer to the output vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from the input vector that
// are added to produce one element of the output.
template <typename IN, typename OUT>
void PortableReductionSumVector(const IN* input_vector, OUT* output_vector,
                                int output_size, int reduction_size) {
  for (int o = 0; o < output_size; o++) {
    OUT result = 0;
    for (int r = 0; r < reduction_size; r++) {
      result += input_vector[r];
    }
    output_vector[o] = result;
    input_vector += reduction_size;
  }
}
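
// Example usage of PortableReductionSumVector, which sums each group of
// `reduction_size` consecutive input elements into one output element:
//
//   const int32_t input[6] = {1, 2, 3, 4, 5, 6};
//   int32_t output[2];
//   PortableReductionSumVector(input, output, /*output_size=*/2,
//                              /*reduction_size=*/3);
//   // output is now {6, 15}.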

// Layer norm for each batch: normalizes each of the n_batch rows of the input
// (length v_size) to zero mean and unit variance.
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch);

// Saturating add.
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output);
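
// The routine above presumably adds the zero-point-adjusted, rescaled `input`
// and `recurrent` contributions element-wise over n_batch * n_cell values,
// saturating the sum to the int16_t range; the rescaling is assumed to follow
// the effective_scale_a/b multiplier/shift convention noted earlier.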

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_