/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/kernel_utils.h"

#include <algorithm>

#include "tensorflow/lite/kernels/internal/tensor_utils.h"

namespace tflite {
namespace kernel_utils {

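// Float-only convenience overload: forwards to the full float RnnBatchStep
// below with no auxiliary input (null aux pointers, aux_input_size == 0).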
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
               batch_size, output_batch_leading_dim, activation,
               hidden_state_ptr_batch, output_ptr_batch);
}

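// Computes one batched step of a fused RNN cell in float:
//   output = activation(input_weights * input + aux_input_weights * aux_input
//                       + recurrent_weights * hidden_state + bias)
// and copies the activated output back into hidden_state. When
// output_batch_leading_dim != num_units the output rows are strided, so the
// batched operations are unrolled and applied one batch row at a time.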
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* aux_input_ptr_batch,
                  const float* aux_input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int aux_input_size, int num_units,
                  int batch_size, int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != num_units), we unroll the batched operations
  // where this is the case.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Output += input * input_weights
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
        output_ptr_batch, /*result_stride=*/1);

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_input_ptr_batch, batch_size, output_ptr_batch,
          /*result_stride=*/1);
    }

    // Output += recurrent_weights * hidden_state
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
        batch_size, output_ptr_batch, /*result_stride=*/1);

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(
        output_ptr_batch, num_units * batch_size, activation,
        output_ptr_batch);
    std::copy_n(output_ptr_batch, num_units * batch_size,
                hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      std::copy_n(bias_ptr, num_units,
                  output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output += input * input_weights
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size,
          input_ptr_batch + k * input_size, /*n_batch=*/1,
          output_ptr_batch + k * output_batch_leading_dim,
          /*result_stride=*/1);
    }

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_input_ptr_batch + k * aux_input_size,
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output += recurrent_weights * hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          hidden_state_ptr_batch + k * num_units,
          /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
          /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      std::copy_n(output_ptr_batch + k * output_batch_leading_dim, num_units,
                  hidden_state_ptr_batch + k * num_units);
    }
  }
}

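// Hybrid (int8-weight) convenience overload: forwards to the full hybrid
// RnnBatchStep below with no auxiliary input.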
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const int8_t* recurrent_weights_ptr,
    float recurrent_weights_scale, const float* bias_ptr, int input_size,
    int num_units, int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr,
               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
               recurrent_weights_scale, bias_ptr, input_size,
               /*aux_input_size=*/0, num_units, batch_size,
               output_batch_leading_dim, activation, quantized_input_ptr_batch,
               /*aux_quantized_input_ptr_batch=*/nullptr,
               quantized_hidden_state_ptr_batch, scaling_factors,
               hidden_state_ptr_batch, output_ptr_batch);
}

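// Hybrid RNN step: the weights are int8 with per-tensor scales while the
// activations stay in float. Each batch row of the input, auxiliary input,
// and hidden state is symmetrically quantized to int8 with its own scaling
// factor (pre-multiplied by the matching weight scale) before the
// accumulating matmuls; all-zero inputs skip both the quantization and the
// matmul. The activated float output is copied back into hidden_state, as in
// the float version above.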
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const float* aux_input_ptr_batch,
    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
    int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* aux_quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != num_units), we unroll the batched operations
  // where this is the case.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function),
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
          scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min,
            &unused_max, &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(
        output_ptr_batch, num_units * batch_size, activation,
        output_ptr_batch);
    std::copy_n(output_ptr_batch, num_units * batch_size,
                hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      std::copy_n(bias_ptr, num_units,
                  output_ptr_batch + k * output_batch_leading_dim);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function),
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            input_weights_ptr, num_units, input_size,
            quantized_input_ptr_batch + k * input_size, &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_quantized_input_ptr_batch + k * aux_input_size,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min,
            &unused_max, &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            recurrent_weights_ptr, num_units, num_units,
            quantized_hidden_state_ptr_batch + k * num_units,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      std::copy_n(output_ptr_batch + k * output_batch_leading_dim, num_units,
                  hidden_state_ptr_batch + k * num_units);
    }
  }
}

}  // namespace kernel_utils
}  // namespace tflite