/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_

#include <algorithm>
#include <cstdint>

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {

// Not all backends support CpuBackendContext usage, so forward declare to avoid
// pulling in its implementation.
class CpuBackendContext;

namespace tensor_utils {

template <typename T>
bool PortableIsZeroVector(const T* vector, int v_size) {
  for (int i = 0; i < v_size; ++i) {
    if (vector[i] != 0) {
      return false;
    }
  }
  return true;
}

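// Float-to-int8 quantization helpers. The symmetric variants use (or derive) a
// range centered on zero and report the scaling factor that maps one quantized
// step back to float; the asymmetric variant also reports a zero-point offset.
// (Brief summary of the expected contract; see the corresponding
// implementation for the exact rounding and clamping behavior.)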
void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float* min_value,
                                     float* max_value, float* scaling_factor);

void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                     int8_t* quantized_values, float min_value,
                                     float max_value, float* scaling_factor);

void PortableAsymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
                                      float* scaling_factor, int32_t* offset);

// Multiply a matrix by a batch vector, and store results in a batch-size
// vector.
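// The matrix is m_rows x m_cols in row-major order, the batch vector holds
// n_batch contiguous vectors of m_cols elements each, and the n_batch * m_rows
// outputs are accumulated into (not overwritten in) the result. (Shape and
// accumulation convention assumed from the usual tensor_utils usage.)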
void PortableMatrixBatchVectorMultiplyAccumulate(const float* matrix,
                                                 int m_rows, int m_cols,
                                                 const float* vector,
                                                 int n_batch, float* result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vector, const float* scaling_factors,
    int n_batch, int32_t* scratch, float* __restrict__ result,
    CpuBackendContext* context);

void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4(
    const float* __restrict__ matrix, const int32_t* __restrict__ segments,
    const int32_t* __restrict__ indices, int m_rows, int m_cols,
    const float* __restrict__ vector, int n_batch, float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const float* __restrict__ matrix, const uint8_t* __restrict__ ledger,
    int m_rows, int m_cols, const float* __restrict__ vector, int n_batch,
    float* __restrict__ result);

void PortableSparseMatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const uint8_t* ledger, const int m_rows,
    const int m_cols, const int8_t* __restrict__ vectors,
    const float* scaling_factors, int n_batch, float* __restrict__ result);

// Dot product of two vectors.
float PortableVectorVectorDotProduct(const float* vector1, const float* vector2,
                                     int v_size);

void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1,
                                              const int16_t* vector2,
                                              int v_size, int n_batch,
                                              int32_t* result);

void PortableVectorBatchVectorCwiseProductAccumulate(
    const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch,
    int32_t multiplier, int shift, int16_t* result);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

void PortableMatrixBatchVectorMultiply(const int8_t* input,
                                       int32_t input_zeropoint,
                                       const int8_t* input_to_gate_weights,
                                       int32_t input_to_gate_effective_scale_a,
                                       int32_t input_to_gate_effective_scale_b,
                                       int32_t n_batch, int32_t n_input,
                                       int32_t n_cell, int8_t* gate_output,
                                       int8_t gate_output_zp);

void PortableMatrixBatchVectorMultiply(
    const int16_t* hidden, const int8_t* hidden_to_output_weights,
    int32_t proj_effective_scale_a, int32_t proj_effective_scale_b,
    const int32_t* gate_bias, int32_t n_batch, int32_t n_hidden,
    int32_t n_output, int32_t output_zp, int8_t* proj_output);

void PortableMatrixScalarMultiplyAccumulate(const int8_t* matrix,
                                            int32_t scalar, int32_t n_row,
                                            int32_t n_col, int32_t* output);

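// Applies layer normalization to each batch of n_input int16 values, using the
// given weights and bias; layer_norm_scale_a/layer_norm_scale_b act as the
// quantized multiplier and shift. (Descriptive summary; the fixed-point
// details live in the implementation.)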
void PortableApplyLayerNorm(const int16_t* input,
                            const int16_t* layer_norm_weights,
                            const int32_t* bias, int32_t layer_norm_scale_a,
                            int32_t layer_norm_scale_b, int32_t variance_limit,
                            int n_batch, int n_input, int16_t* output);

void PortableApplyLayerNormFloat(const int16_t* input,
                                 const int16_t* layer_norm_weights,
                                 int32_t layer_norm_scale_a,
                                 int32_t layer_norm_scale_b,
                                 const int32_t* bias, int n_batch, int n_input,
                                 int16_t* output);

void PortableApplySigmoid(const int16_t* input, int32_t n_batch,
                          int32_t n_input, int16_t* output);

void PortableApplySigmoidFloat(const int16_t* input, int32_t n_batch,
                               int32_t n_input, int16_t* output);

void PortableApplyTanh(int32_t integer_bits, const int16_t* input,
                       int32_t n_batch, int32_t n_input, int16_t* output);

void PortableApplyTanhFloat(const int16_t* input, int32_t n_batch,
                            int32_t n_input, int32_t integer_bits,
                            int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int shift, int16_t* output);

void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2,
                      int32_t multiplier, int32_t shift, int32_t n_batch,
                      int32_t n_input, int32_t output_zp, int8_t* output);

void PortableCwiseAdd(const int16_t* input_1, const int16_t* input_2,
                      int n_batch, int n_input, int16_t* output);

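// Clamps each element of the vector to the range [-clipping_value,
// clipping_value] in place.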
template <typename T>
void PortableCwiseClipping(T* vector, const int v_size,
                           const T& clipping_value) {
  for (int i = 0; i < v_size; i++) {
    vector[i] = std::max(std::min(clipping_value, vector[i]),
                         static_cast<T>(-clipping_value));
  }
}

// Batch vector initialization with another vector.
void PortableVectorBatchVectorAssign(const float* vector, int v_size,
                                     int n_batch, float* batch_vector);

// Compute "1.0f - elements of vector" (used in CIFG).
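// (CIFG is the LSTM variant with coupled input and forget gates, where the
// forget gate is taken to be 1 - input gate.)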
void PortableSub1Vector(const float* vector, int v_size, float* result);

void PortableSub1Vector(const int16_t* vector, int v_size, int16_t* result);

// Multiply all elements of vector by a scalar.
void PortableVectorScalarMultiply(const int8_t* vector, int v_size, float scale,
                                  float* result);

// Reduce-sum on a vector:
// input_vector: pointer to input vector.
// output_vector: pointer to output vector.
// output_size: output vector size.
// reduction_size: number of consecutive elements from input vector which are
// added to get one element of output.
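// For example, with reduction_size 2 and output_size 3, the input
// {1, 2, 3, 4, 5, 6} produces the output {3, 7, 11}.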
template <typename IN, typename OUT>
void PortableReductionSumVector(const IN* input_vector, OUT* output_vector,
                                int output_size, int reduction_size) {
  for (int o = 0; o < output_size; o++) {
    OUT result = 0;
    for (int r = 0; r < reduction_size; r++) {
      result += input_vector[r];
    }
    output_vector[o] = result;
    input_vector += reduction_size;
  }
}

// Layer norm for each batch.
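// Each of the n_batch rows of v_size elements is normalized to zero mean and
// unit standard deviation. (Expected behavior of mean/stddev normalization;
// see the implementation for how an all-constant row is handled.)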
void PortableMeanStddevNormalization(const float* __restrict__ input_vector,
                                     float* __restrict__ output_vector,
                                     int v_size, int n_batch);

// Saturate Add.
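// Rescales the quantized input and recurrent contributions by their effective
// scales and adds them with int16 saturation into the output. (Summary of the
// expected behavior; see the implementation for the exact arithmetic.)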
void PortableTwoGateSaturatingAdd(const int8_t* input, int8_t input_zp,
                                  const int8_t* recurrent, int8_t recurrent_zp,
                                  int32_t input_effective_scale_a,
                                  int32_t input_effective_scale_b,
                                  int32_t recurrent_effective_scale_a,
                                  int32_t recurrent_effective_scale_b,
                                  int32_t n_batch, int32_t n_cell,
                                  int16_t* output);

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PORTABLE_TENSOR_UTILS_IMPL_H_