/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/kernel_utils.h"

#include "tensorflow/lite/kernels/internal/tensor_utils.h"

namespace tflite {
namespace kernel_utils {

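// Float convenience overload: forwards to the full float RnnBatchStep below
// with no auxiliary input (aux_input_ptr_batch == nullptr, aux_input_size ==
// 0).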
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
               batch_size, output_batch_leading_dim, activation,
               hidden_state_ptr_batch, output_ptr_batch);
}

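// Full float path for one RNN time step over a whole batch: the input, the
// optional auxiliary input, and the previous hidden state are each multiplied
// by their weight matrices, accumulated together with the bias, and passed
// through the fused activation. The result is written to output_ptr_batch and
// copied back into hidden_state_ptr_batch. When output_batch_leading_dim
// differs from num_units, each batch row is processed separately so that
// writes respect the output stride.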
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* aux_input_ptr_batch,
                  const float* aux_input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int aux_input_size, int num_units,
                  int batch_size, int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != num_units), we unroll the batched operations
  // where this is the case.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Output += input * input_weights
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
        output_ptr_batch, /*result_stride=*/1);

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch,
          batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    // Output += recurrent_weights * hidden_state
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
        batch_size, output_ptr_batch, /*result_stride=*/1);

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(output_ptr_batch,
                                          num_units * batch_size, activation,
                                          output_ptr_batch);
    tensor_utils::CopyVector(output_ptr_batch, num_units * batch_size,
                             hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::CopyVector(bias_ptr, num_units,
                               output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output += input * input_weights
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size,
          input_ptr_batch + k * input_size, /*n_batch=*/1,
          output_ptr_batch + k * output_batch_leading_dim, /*result_stride=*/1);
    }

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_input_ptr_batch + k * aux_input_size,
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output += recurrent_weights * hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          hidden_state_ptr_batch + k * num_units,
          /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
          /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
                               num_units,
                               hidden_state_ptr_batch + k * num_units);
    }
  }
}

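// Hybrid (8-bit weight) convenience overload: forwards to the full hybrid
// RnnBatchStep below with no auxiliary input.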
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const int8_t* recurrent_weights_ptr,
    float recurrent_weights_scale, const float* bias_ptr, int input_size,
    int num_units, int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr,
               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
               recurrent_weights_scale, bias_ptr, input_size,
               /*aux_input_size=*/0, num_units, batch_size,
               output_batch_leading_dim, activation, quantized_input_ptr_batch,
               /*aux_quantized_input_ptr_batch=*/nullptr,
               quantized_hidden_state_ptr_batch, scaling_factors,
               hidden_state_ptr_batch, output_ptr_batch);
}

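// Full hybrid path: weights are int8 with per-tensor float scales, activations
// stay float. Each batch row of the input (and of the previous hidden state)
// is symmetrically quantized to int8 on the fly into the provided scratch
// buffers, and its per-row scaling factor is multiplied by the corresponding
// weight scale so the int8 matmul accumulates back into the float output.
// Quantization and the matmul are skipped entirely when the corresponding
// float input is all zeros.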
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const float* aux_input_ptr_batch,
    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
    int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* aux_quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != num_units), we unroll the batched operations
  // where this is the case.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
          scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(output_ptr_batch,
                                          num_units * batch_size, activation,
                                          output_ptr_batch);
    tensor_utils::CopyVector(output_ptr_batch, num_units * batch_size,
                             hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::CopyVector(bias_ptr, num_units,
                               output_ptr_batch + k * output_batch_leading_dim);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            input_weights_ptr, num_units, input_size,
            quantized_input_ptr_batch + k * input_size, &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_quantized_input_ptr_batch + k * aux_input_size,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            recurrent_weights_ptr, num_units, num_units,
            quantized_hidden_state_ptr_batch + k * num_units,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
                               num_units,
                               hidden_state_ptr_batch + k * num_units);
    }
  }
}

}  // namespace kernel_utils
}  // namespace tflite