/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/kernel_utils.h"

#include <algorithm>

#include "tensorflow/lite/kernels/internal/tensor_utils.h"

namespace tflite {
namespace kernel_utils {

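// Convenience overload for graphs without an auxiliary input; forwards to the
// full float RnnBatchStep below with null auxiliary pointers and
// aux_input_size == 0.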
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int num_units, int batch_size,
                  int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr, recurrent_weights_ptr,
               bias_ptr, input_size, /*aux_input_size=*/0, num_units,
               batch_size, output_batch_leading_dim, activation,
               hidden_state_ptr_batch, output_ptr_batch);
}

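// One fully-batched RNN time step over float tensors. For each batch row b
// this computes
//
//   output[b]       = activation(bias + input_weights * input[b]
//                                     + aux_input_weights * aux_input[b]
//                                     + recurrent_weights * hidden_state[b])
//   hidden_state[b] = output[b]
//
// (the auxiliary term only when aux_input_size > 0). Each output row starts
// output_batch_leading_dim floats after the previous one, and only its first
// num_units entries are written.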
void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
                  const float* aux_input_ptr_batch,
                  const float* aux_input_weights_ptr,
                  const float* recurrent_weights_ptr, const float* bias_ptr,
                  int input_size, int aux_input_size, int num_units,
                  int batch_size, int output_batch_leading_dim,
                  TfLiteFusedActivation activation,
                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // The output batch rows may not be contiguous (output_batch_leading_dim !=
  // num_units); when that is the case, unroll the batched operations into
  // per-batch-row operations.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Output += input * input_weights
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        input_weights_ptr, num_units, input_size, input_ptr_batch, batch_size,
        output_ptr_batch, /*result_stride=*/1);

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size, aux_input_ptr_batch,
          batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    // Output += recurrent_weights * hidden_state
    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
        recurrent_weights_ptr, num_units, num_units, hidden_state_ptr_batch,
        batch_size, output_ptr_batch, /*result_stride=*/1);

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(
        output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
    std::copy_n(output_ptr_batch, num_units * batch_size,
                hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      std::copy_n(bias_ptr, num_units,
                  output_ptr_batch + k * output_batch_leading_dim);
    }

    // Output += input * input_weights
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size,
          input_ptr_batch + k * input_size, /*n_batch=*/1,
          output_ptr_batch + k * output_batch_leading_dim, /*result_stride=*/1);
    }

    // Output += aux_input * aux_input_weights (if they are not empty).
    if (aux_input_size > 0) {
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_input_ptr_batch + k * aux_input_size,
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output += recurrent_weights * hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          hidden_state_ptr_batch + k * num_units,
          /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
          /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      std::copy_n(output_ptr_batch + k * output_batch_leading_dim, num_units,
                  hidden_state_ptr_batch + k * num_units);
    }
  }
}

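// A minimal usage sketch for the float path above (illustrative only; the
// shapes and values below are assumptions made for this comment, not part of
// the library). Weight matrices are row-major, one row per output unit:
//
//   constexpr int kInputSize = 2, kNumUnits = 3, kBatchSize = 1;
//   const float input[kBatchSize * kInputSize] = {0.5f, -1.0f};
//   const float input_weights[kNumUnits * kInputSize] = {
//       0.1f, 0.2f,   // unit 0
//       0.3f, 0.4f,   // unit 1
//       0.5f, 0.6f};  // unit 2
//   const float recurrent_weights[kNumUnits * kNumUnits] = {0.0f};  // all 0
//   const float bias[kNumUnits] = {0.1f, 0.1f, 0.1f};
//   float hidden_state[kBatchSize * kNumUnits] = {0.0f};  // initial state
//   float output[kBatchSize * kNumUnits];
//   kernel_utils::RnnBatchStep(input, input_weights, recurrent_weights, bias,
//                              kInputSize, kNumUnits, kBatchSize,
//                              /*output_batch_leading_dim=*/kNumUnits,
//                              kTfLiteActTanh, hidden_state, output);
//   // Now output[i] == tanh(bias[i] + <input_weights row i, input>), and
//   // hidden_state equals output, ready for the next time step.

// Hybrid (float activations, symmetric int8 weights) overload without an
// auxiliary input; forwards to the full hybrid RnnBatchStep below.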
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const int8_t* recurrent_weights_ptr,
    float recurrent_weights_scale, const float* bias_ptr, int input_size,
    int num_units, int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  RnnBatchStep(input_ptr_batch, input_weights_ptr, input_weights_scale,
               /*aux_input_ptr_batch=*/nullptr,
               /*aux_input_weights_ptr=*/nullptr,
               /*aux_input_weights_scale=*/0.0f, recurrent_weights_ptr,
               recurrent_weights_scale, bias_ptr, input_size,
               /*aux_input_size=*/0, num_units, batch_size,
               output_batch_leading_dim, activation, quantized_input_ptr_batch,
               /*aux_quantized_input_ptr_batch=*/nullptr,
               quantized_hidden_state_ptr_batch, scaling_factors,
               hidden_state_ptr_batch, output_ptr_batch);
}

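// One fully-batched hybrid RNN time step: weights are symmetric-quantized
// int8, activations stay float. Each float input (and the hidden state) is
// symmetric-quantized per batch row into the caller-provided scratch buffers,
// and the per-row input scale is folded together with the weight scale, so
// each matmul accumulates
//
//   output[b] += (input_scale[b] * weights_scale) * (weights_q * input_q[b])
//
// Quantization and the matmul are skipped entirely when the corresponding
// float input is all zeros.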
void RnnBatchStep(
    const float* input_ptr_batch, const int8_t* input_weights_ptr,
    float input_weights_scale, const float* aux_input_ptr_batch,
    const int8_t* aux_input_weights_ptr, float aux_input_weights_scale,
    const int8_t* recurrent_weights_ptr, float recurrent_weights_scale,
    const float* bias_ptr, int input_size, int aux_input_size, int num_units,
    int batch_size, int output_batch_leading_dim,
    TfLiteFusedActivation activation, int8_t* quantized_input_ptr_batch,
    int8_t* aux_quantized_input_ptr_batch,
    int8_t* quantized_hidden_state_ptr_batch, float* scaling_factors,
    float* hidden_state_ptr_batch, float* output_ptr_batch) {
  // The output batch rows may not be contiguous (output_batch_leading_dim !=
  // num_units); when that is the case, unroll the batched operations into
  // per-batch-row operations.
  if (output_batch_leading_dim == num_units) {
    // Output = bias
    tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                          output_ptr_batch);

    // Skip the quantization and matmul work when the input is all zeros.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize the input from float to int8 and compute the quantization
      // params (scaling factor) per batch row.
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function),
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
          scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          aux_input_weights_ptr, num_units, aux_input_size,
          aux_quantized_input_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Skip the quantization and matmul work when the hidden state is all
    // zeros.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state.
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
          recurrent_weights_ptr, num_units, num_units,
          quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
          output_ptr_batch, /*result_stride=*/1);
    }

    // Output = activation(Output) and update hidden_state
    tensor_utils::ApplyActivationToVector(
        output_ptr_batch, num_units * batch_size, activation, output_ptr_batch);
    std::copy_n(output_ptr_batch, num_units * batch_size,
                hidden_state_ptr_batch);
  } else {
    // Output = bias
    for (int k = 0; k < batch_size; k++) {
      std::copy_n(bias_ptr, num_units,
                  output_ptr_batch + k * output_batch_leading_dim);
    }

    // Skip the quantization and matmul work when the input is all zeros.
    if (!tensor_utils::IsZeroVector(input_ptr_batch, batch_size * input_size)) {
      // Quantize the input from float to int8 and compute the quantization
      // params (scaling factor) per batch row.
      float unused_min, unused_max;
      // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function),
      // whichever is faster.
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * input_size;
        tensor_utils::SymmetricQuantizeFloats(
            input_ptr_batch + offset, input_size,
            quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= input_weights_scale;
      }

      // Output += input * input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            input_weights_ptr, num_units, input_size,
            quantized_input_ptr_batch + k * input_size, &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    if (aux_input_ptr_batch &&
        !tensor_utils::IsZeroVector(aux_input_ptr_batch,
                                    batch_size * aux_input_size)) {
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * aux_input_size;
        tensor_utils::SymmetricQuantizeFloats(
            aux_input_ptr_batch + offset, aux_input_size,
            aux_quantized_input_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= aux_input_weights_scale;
      }

      // Output += aux_input * aux_input_weights
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            aux_input_weights_ptr, num_units, aux_input_size,
            aux_quantized_input_ptr_batch + k * aux_input_size,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Skip the quantization and matmul work when the hidden state is all
    // zeros.
    if (!tensor_utils::IsZeroVector(hidden_state_ptr_batch,
                                    batch_size * num_units)) {
      // Quantize hidden_state.
      float unused_min, unused_max;
      for (int b = 0; b < batch_size; ++b) {
        const int offset = b * num_units;
        tensor_utils::SymmetricQuantizeFloats(
            hidden_state_ptr_batch + offset, num_units,
            quantized_hidden_state_ptr_batch + offset, &unused_min, &unused_max,
            &scaling_factors[b]);
        scaling_factors[b] *= recurrent_weights_scale;
      }

      // Output += recurrent_weights * hidden_state
      for (int k = 0; k < batch_size; k++) {
        tensor_utils::MatrixBatchVectorMultiplyAccumulate(
            recurrent_weights_ptr, num_units, num_units,
            quantized_hidden_state_ptr_batch + k * num_units,
            &scaling_factors[k],
            /*n_batch=*/1, output_ptr_batch + k * output_batch_leading_dim,
            /*result_stride=*/1);
      }
    }

    // Output = activation(Output) and update hidden_state
    for (int k = 0; k < batch_size; k++) {
      tensor_utils::ApplyActivationToVector(
          output_ptr_batch + k * output_batch_leading_dim, num_units,
          activation, output_ptr_batch + k * output_batch_leading_dim);
      std::copy_n(output_ptr_batch + k * output_batch_leading_dim, num_units,
                  hidden_state_ptr_batch + k * num_units);
    }
  }
}

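// A minimal usage sketch for the hybrid path above (illustrative only; the
// shapes, weight values, and scales below are assumptions made for this
// comment, not part of the library). The caller provides scratch buffers for
// the quantized activations plus one scaling factor per batch row:
//
//   constexpr int kInputSize = 2, kNumUnits = 3, kBatchSize = 1;
//   const float input[kBatchSize * kInputSize] = {0.5f, -1.0f};
//   const int8_t input_weights[kNumUnits * kInputSize] = {
//       10, 20,   // unit 0
//       30, 40,   // unit 1
//       50, 60};  // unit 2
//   const int8_t recurrent_weights[kNumUnits * kNumUnits] = {0};  // all 0
//   const float bias[kNumUnits] = {0.1f, 0.1f, 0.1f};
//   int8_t quantized_input[kBatchSize * kInputSize];   // scratch
//   int8_t quantized_hidden[kBatchSize * kNumUnits];   // scratch
//   float scaling_factors[kBatchSize];                 // scratch
//   float hidden_state[kBatchSize * kNumUnits] = {0.0f};  // initial state
//   float output[kBatchSize * kNumUnits];
//   kernel_utils::RnnBatchStep(
//       input, input_weights, /*input_weights_scale=*/0.01f,
//       recurrent_weights, /*recurrent_weights_scale=*/0.01f, bias,
//       kInputSize, kNumUnits, kBatchSize,
//       /*output_batch_leading_dim=*/kNumUnits, kTfLiteActTanh,
//       quantized_input, quantized_hidden, scaling_factors, hidden_state,
//       output);
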
}  // namespace kernel_utils
}  // namespace tflite