/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/kernel_utils.h"

#include "tensorflow/lite/kernels/internal/tensor_utils.h"

namespace tflite {
namespace kernel_utils {

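// Convenience overload: forwards to the full RnnBatchStep below with no
// auxiliary input.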
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
               batch_size, output_batch_leading_dim, activation,
               hidden_state_ptr_batch, output_ptr_batch);
}

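// Performs one RNN time step for the whole batch. For each batch row,
//   output = activation(input * input_weights + aux_input * aux_input_weights
//            + hidden_state * recurrent_weights + bias),
// and hidden_state is then overwritten with the new output.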
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* aux_input_ptr_batch,
                  const float* aux_input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int aux_input_size, int num_units,
                  int batch_size, int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != num_units), we unroll the batched operations
  // where this is the case.
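  // For example, with output_batch_leading_dim == 16 and num_units == 8,
  // batch row k starts at output_ptr_batch + k * 16 and only its first 8
  // floats are written.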
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Output += input * input_weights
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
        output_ptr_batch, /*result_stride=*/1);

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch,
          batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    // Output += recurrent_weights * hidden_state
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
        batch_size, output_ptr_batch, /*result_stride=*/1);

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(
        output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
    tensor_utils::CopyVector(output_ptr_batch, num_units * batch_size,
                             hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::CopyVector(bias_ptr, num_units,
                               output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output += input * input_weights
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size,
          input_ptr_batch + k * input_size, /*n_batch=*/1,
          output_ptr_batch + k * output_batch_leading_dim, /*result_stride=*/1);
    }

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_input_ptr_batch + k * aux_input_size,
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output += recurrent_weights * hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          hidden_state_ptr_batch + k * num_units,
          /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
          /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
                               num_units,
                               hidden_state_ptr_batch + k * num_units);
    }
  }
}

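// Hybrid (float activations, int8 weights) convenience overload: forwards to
// the full hybrid RnnBatchStep below with no auxiliary input.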
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const int8_t* recurrent_weights_ptr,
    float recurrent_weights_scale, const float* bias_ptr, int input_size,
    int num_units, int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr,
               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
               recurrent_weights_scale, bias_ptr, input_size,
               /*aux_input_size=*/0, num_units, batch_size,
               output_batch_leading_dim, activation, quantized_input_ptr_batch,
               /*aux_quantized_input_ptr_batch=*/nullptr,
               quantized_hidden_state_ptr_batch, scaling_factors,
               hidden_state_ptr_batch, output_ptr_batch);
}

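// Full hybrid variant: the float input, aux input, and hidden state are
// symmetrically quantized to int8 per batch row into the caller-provided
// scratch buffers before each weight matmul; scaling_factors holds the
// per-row scale used to dequantize the accumulated products back to float.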
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const float* aux_input_ptr_batch,
    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
    int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* aux_quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // Since the output batch rows may not be contiguous
  // (output_batch_leading_dim != num_units), we unroll the batched operations
  // where this is the case.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
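      // Each batch row gets its own scale; folding input_weights_scale into
      // scaling_factors[b] lets the int8 matmul below accumulate directly
      // into the float output.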
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
          scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(
        output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
    tensor_utils::CopyVector(output_ptr_batch, num_units * batch_size,
                             hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::CopyVector(bias_ptr, num_units,
                               output_ptr_batch + k * output_batch_leading_dim);
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize input from float to int8 + quantization params (scaling
      // factor).
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            input_weights_ptr, num_units, input_size,
            quantized_input_ptr_batch + k * input_size, &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_quantized_input_ptr_batch + k * aux_input_size,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Save quantization and matmul computation for all zero input.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            recurrent_weights_ptr, num_units, num_units,
            quantized_hidden_state_ptr_batch + k * num_units,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      tensor_utils::CopyVector(output_ptr_batch + k * output_batch_leading_dim,
                               num_units,
                               hidden_state_ptr_batch + k * num_units);
    }
  }
}

}  // namespace kernel_utils
}  // namespace tflite