/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_

#include <algorithm>
#include <cmath>
#include <cstdint>

#include "Eigen/Core"
#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h"

#if defined(_MSC_VER)
#define __restrict__ __restrict
#endif

namespace tflite {

// Not all backends support CpuBackendContext usage, so forward declare to
// avoid pulling in its implementation. Use of CpuBackendContext in method
// implementations is purely optional.
class CpuBackendContext;

namespace tensor_utils {
// Multiplies the int8 matrix by a batch of int8 vectors, scales each batch's
// int32 accumulator by its scaling factor, and accumulates into the float
// result buffer. A scratch buffer is provided for the int8 x int8 -> int32
// products, and a CpuBackendContext for the accumulator computation.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors,
    const float* __restrict__ scaling_factors, int n_batch,
    int32_t* __restrict__ scratch, float* __restrict__ result,
    CpuBackendContext* __restrict__ context);
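//
// Example usage (a hedged sketch; buffer names and sizes are illustrative,
// not part of this API). For m_rows = 16, m_cols = 32, n_batch = 4:
//   int8_t matrix[16 * 32];      // row-major quantized weights
//   int8_t vectors[4 * 32];      // four quantized input vectors
//   float scaling_factors[4];    // one scale per batch entry
//   int32_t scratch[4 * 16];     // holds int8 x int8 -> int32 accumulators
//   float result[4 * 16] = {};   // output is accumulated into this buffer
//   CpuBackendContext* context = ...;  // obtained from the op's kernel state
//   MatrixBatchVectorMultiplyAccumulate(matrix, /*m_rows=*/16, /*m_cols=*/32,
//                                       vectors, scaling_factors,
//                                       /*n_batch=*/4, scratch, result,
//                                       context);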

// Same as the function above, but supports per-channel scales and an
// asymmetric input offset, and can make use of cached row sums: the per-row
// weight sums are recomputed only when *compute_row_sums is true.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float* scaling_factors,
    int n_batch, float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, CpuBackendContext* context);
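//
// Why row sums help (a sketch of the reasoning, not a normative spec): with
// an asymmetric input offset o, each output element is
//   dot(w_row, x - o) = dot(w_row, x) - o * sum(w_row),
// so the per-row weight sums can be computed once, cached in row_sums, and
// reused across calls; compute_row_sums signals when the cache must be
// refreshed (e.g. after the weights change).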

// Same as the function above, but takes separate scaling factors for the
// matrix and the vectors. The per-batch scaling factors are multiplied
// together into the scaling_factor_scratch buffer before forwarding.
inline void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* __restrict__ matrix, const int m_rows, const int m_cols,
    const int8_t* __restrict__ vectors, const float matrix_scaling_factor,
    const float* vector_scaling_factors, int n_batch,
    float* __restrict__ result, const float* per_channel_scale,
    const int32_t* input_offset, int32_t* scratch, int32_t* row_sums,
    bool* compute_row_sums, float* scaling_factor_scratch,
    CpuBackendContext* context) {
  for (int b = 0; b < n_batch; ++b) {
    scaling_factor_scratch[b] =
        vector_scaling_factors[b] * matrix_scaling_factor;
  }
  MatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
                                      scaling_factor_scratch, n_batch, result,
                                      per_channel_scale, input_offset, scratch,
                                      row_sums, compute_row_sums, context);
}
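//
// Worked example (illustrative values): with matrix_scaling_factor = 0.5f and
// vector_scaling_factors = {2.0f, 4.0f} for n_batch = 2, the wrapper fills
// scaling_factor_scratch with {1.0f, 2.0f} and forwards to the overload that
// takes one combined scaling factor per batch.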

// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed of input vectors independent from each other). The
// result of the multiplication is accumulated into the passed result buffer.
// More specifically, for a matrix M of shape [n, i] and a batched-vector
// of shape [i, batch] it will first compute the product of shape [n, batch].
// This product is then accumulated into the result buffer.
// Parameters:
// - input: batch vector of size n_batch * n_input
// - bias: vector of size n_output
// - input_to_gate_weights: matrix of size n_input * n_output
// - multiplier: scalar
// - shift: scalar
// - n_batch: the batch size
// - n_input: the input size
// - n_output: the output size
// - output_zp: the zero point of the output
// - scratch: batch vector of size n_batch * n_output
// - output: the 16 bit output
// Notes:
// - this is used for gate matmuls: for non-CIFG it covers the input, forget,
//   cell and output gates; for CIFG, the forget, cell and output gates.
// - multiplier and shift combined give the scale.
// - assumes the input zero point is 0.
// - scratch exists for optimization purposes only.
// TODO(b/152066492): this can be removed if some future optimization
// work makes it unnecessary.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int16_t* output, CpuBackendContext* context);
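//
// Note on "multiplier and shift combined give the scale" (a sketch of the
// usual TFLite fixed-point convention, see QuantizeMultiplier in
// kernels/internal/quantization_util.h): the real-valued scale is encoded as
//   real_scale ~= multiplier * 2^(shift - 31),
// where multiplier is a positive int32 significand in Q31 format. A caller
// might derive the pair roughly as follows (illustrative only):
//   double effective_scale =
//       input_scale * weight_scale / output_scale;  // assumed real scales
//   int32_t multiplier;
//   int shift;
//   QuantizeMultiplier(effective_scale, &multiplier, &shift);
// The same convention applies to the 8 bit projection variant below.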

// Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
// dimension composed of input vectors independent from each other). The
// result of the multiplication is accumulated into the passed result buffer.
// More specifically, for a matrix M of shape [n, i] and a batched-vector
// of shape [i, batch] it will first compute the product of shape [n, batch].
// This product is then accumulated into the result buffer.
// Parameters:
// - input: batch vector of size n_batch * n_input
// - bias: vector of size n_output
// - input_to_gate_weights: matrix of size n_input * n_output
// - multiplier: scalar
// - shift: scalar
// - n_batch: the batch size
// - n_input: the input size
// - n_output: the output size
// - output_zp: the zero point of the output
// - scratch: batch vector of size n_batch * n_output
// - output: the 8 bit output
// Notes:
// - this is used for the projection matmul.
// - multiplier and shift combined give the scale.
// - assumes the input zero point is 0.
// - scratch exists for optimization purposes only.
// TODO(b/152066492): this can be removed if some future optimization
// work makes it unnecessary.
void MatrixBatchVectorMultiplyAccumulate(
    const int8_t* input, const int32_t* bias,
    const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift,
    int32_t n_batch, int32_t n_input, int32_t n_output, int32_t output_zp,
    int32_t* scratch, int8_t* output, CpuBackendContext* context);

// Apply Rectified Linear to elements of a vector.
inline void ApplyReluToVector(const float* __restrict__ vector, int v_size,
                              float* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    result[v] = std::max(0.0f, vector[v]);
  }
}

// Apply Rectified Linear 1 (cap to [-1;1]) to elements of a vector.
inline void ApplyRelu1ToVector(const float* __restrict__ vector, int v_size,
                               float* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    result[v] = std::max(-1.0f, std::min(vector[v], 1.0f));
  }
}

// Apply Rectified Linear 6 (cap to [0;6]) to elements of a vector.
inline void ApplyRelu6ToVector(const float* __restrict__ vector, int v_size,
                               float* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    result[v] = std::max(0.0f, std::min(vector[v], 6.0f));
  }
}

// Apply tanh to elements of a vector.
inline void ApplyTanhToVector(const float* __restrict__ vector, int v_size,
                              float* __restrict__ result) {
  using VectorMap = Eigen::Map<Eigen::Vector<float, Eigen::Dynamic>>;
  VectorMap input_map(const_cast<float* __restrict__>(vector), v_size);
  VectorMap output_map(result, v_size);
  output_map.array() = input_map.array().tanh();
}

// Apply signbit to elements of a vector: result[v] is 1.0f when the sign bit
// of vector[v] is set and 0.0f otherwise.
inline void ApplySignbitToVector(const float* __restrict__ vector, int v_size,
                                 float* __restrict__ result) {
  for (int v = 0; v < v_size; v++) {
    result[v] = std::signbit(vector[v]);
  }
}

// Apply sigmoid to elements of a vector.
inline void ApplySigmoidToVector(const float* __restrict__ vector, int v_size,
                                 float* __restrict__ result) {
  using VectorMap = Eigen::Map<Eigen::Vector<float, Eigen::Dynamic>>;
  VectorMap input_map(const_cast<float* __restrict__>(vector), v_size);
  VectorMap output_map(result, v_size);
  output_map.array() = input_map.array().logistic();
}

// Apply the appropriate activation function to elements of a vector.
inline void ApplyActivationToVector(const float* __restrict__ vector,
                                    int v_size,
                                    TfLiteFusedActivation activation,
                                    float* __restrict__ result) {
  switch (activation) {
    case kTfLiteActNone:
      return;
    case kTfLiteActRelu:
      return ApplyReluToVector(vector, v_size, result);
    case kTfLiteActReluN1To1:
      return ApplyRelu1ToVector(vector, v_size, result);
    case kTfLiteActRelu6:
      return ApplyRelu6ToVector(vector, v_size, result);
    case kTfLiteActTanh:
      return ApplyTanhToVector(vector, v_size, result);
    case kTfLiteActSignBit:
      return ApplySignbitToVector(vector, v_size, result);
    case kTfLiteActSigmoid:
      return ApplySigmoidToVector(vector, v_size, result);
  }
}
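//
// Example (a minimal sketch; names are illustrative): applying a fused tanh
// activation to a gate output of size 8:
//   float gate_output[8] = {...};
//   float activated[8];
//   ApplyActivationToVector(gate_output, 8, kTfLiteActTanh, activated);
// The parameters are __restrict__-qualified, so vector and result should not
// alias.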

}  // namespace tensor_utils
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_TENSOR_UTILS_H_