• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
16 #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
17 
18 #include "ruy/profiler/instrumentation.h"  // from @ruy
19 #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
20 #include "tensorflow/lite/kernels/internal/types.h"
21 
22 namespace tflite {
23 namespace optimized_ops {
24 
25 // Implementation of float DepthwiseConv
26 
// Primary template: intentionally empty.  Only the explicit specializations
// below define a Run() method; they are selected at compile time by whether
// strided input access is allowed (kAllowStrided), and by a fixed input depth
// and depth multiplier (0 meaning "any runtime value").
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
struct FloatDepthwiseConvKernel {};
29 
30 #ifdef USE_NEON
31 
// Specialization: no striding, input_depth == 8, depth_multiplier == 1
// (8 output channels per pixel).  input_depth, depth_multiplier and
// input_ptr_increment are unused: depth is fixed and consecutive output
// pixels read consecutive groups of 8 input values.
template <>
struct FloatDepthwiseConvKernel<false, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters: 8 coefficients, loop-invariant, kept in two
    // 4-lane registers for the whole call.
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    int outp = 0;
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs: 16 floats = 2 pixels x 8 channels.
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate.  The filter repeats every 8 channels, hence the
      // filter[0]/filter[1] alternation across the two pixels.
      acc[0] = vmlaq_f32(acc[0], input[0], filter[0]);
      acc[1] = vmlaq_f32(acc[1], input[1], filter[1]);
      acc[2] = vmlaq_f32(acc[2], input[2], filter[0]);
      acc[3] = vmlaq_f32(acc[3], input[3], filter[1]);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle one output pixel at a time (tail when num_output_pixels is odd).
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
  }
};
92 
// Specialization: no striding, input_depth == 2, depth_multiplier == 1.
// The 2 filter coefficients are duplicated into a 4-lane register so that
// two output pixels (2 channels each) fit one vector multiply-accumulate.
template <>
struct FloatDepthwiseConvKernel<false, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // filters holds the 2 coefficients; filters_dup2 repeats them so one
    // q-register covers 2 consecutive output pixels.
    const float32x2_t filters = vld1_f32(filter_ptr);
    const float32x4_t filters_dup2 = vcombine_f32(filters, filters);
    int outp = 0;
    // Handle 8 output pixels at a time.
    for (; outp <= num_output_pixels - 8; outp += 8) {
      // Load the inputs
      float32x4_t input[4];
      for (int i = 0; i < 4; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 16;
      // Load the accumulators from acc_buffer
      float32x4_t acc[4];
      for (int i = 0; i < 4; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 4; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 4; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 16;
    }
    // Handle 4 output pixels at a time.
    for (; outp <= num_output_pixels - 4; outp += 4) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      input_ptr += 8;
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filters_dup2);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
    }
    // Handle 2 output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs
      const float32x4_t input = vld1q_f32(input_ptr);
      input_ptr += 4;
      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filters_dup2);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle 1 output pixel at a time (scalar-width tail).
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      const float32x2_t input = vld1_f32(input_ptr);
      input_ptr += 2;
      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);
      // Multiply-accumulate
      acc = vmla_f32(acc, input, filters);
      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
175 
// Specialization: strided access allowed, runtime input_depth,
// depth_multiplier == 1.  Channels are processed in chunks of 16, then 4,
// then 1; input_ptr advances by input_ptr_increment per output pixel.
template <>
struct FloatDepthwiseConvKernel<true, 0, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // The filter restarts for every output pixel; the input restarts at
      // this pixel's (strided) position.
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 16 input channels at a time.
      for (; ic <= input_depth - 16; ic += 16) {
        // Load the filters
        float32x4_t filter_0 = vld1q_f32(local_filter_ptr + 4 * 0);
        float32x4_t filter_1 = vld1q_f32(local_filter_ptr + 4 * 1);
        float32x4_t filter_2 = vld1q_f32(local_filter_ptr + 4 * 2);
        float32x4_t filter_3 = vld1q_f32(local_filter_ptr + 4 * 3);
        local_filter_ptr += 16;
        // Load the inputs
        float32x4_t input_0 = vld1q_f32(local_input_ptr + 4 * 0);
        float32x4_t input_1 = vld1q_f32(local_input_ptr + 4 * 1);
        float32x4_t input_2 = vld1q_f32(local_input_ptr + 4 * 2);
        float32x4_t input_3 = vld1q_f32(local_input_ptr + 4 * 3);
        local_input_ptr += 16;
        // Load the accumulators from acc_buffer
        float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
        float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
        float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
        float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
        // Multiply-accumulate
        acc_0 = vmlaq_f32(acc_0, input_0, filter_0);
        acc_1 = vmlaq_f32(acc_1, input_1, filter_1);
        acc_2 = vmlaq_f32(acc_2, input_2, filter_2);
        acc_3 = vmlaq_f32(acc_3, input_3, filter_3);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
        vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
        vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
        vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time.
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x4_t filter;
        filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        float32x4_t input;
        input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x4_t acc;
        acc = vld1q_f32(acc_buffer_ptr);
        // Multiply-accumulate
        acc = vmlaq_f32(acc, input, filter);
        // Store the accumulators back to acc_buffer
        vst1q_f32(acc_buffer_ptr, acc);
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time (scalar tail).
      for (; ic < input_depth; ic++) {
        const float input_val = *local_input_ptr++;
        const float filter_val = *local_filter_ptr++;
        *acc_buffer_ptr++ += filter_val * input_val;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
246 
// Specialization: strided access allowed, runtime input_depth,
// depth_multiplier == 8.  Each input channel contributes to 8 consecutive
// outputs, so the filter pointer advances 8 floats per input channel.
template <>
struct FloatDepthwiseConvKernel<true, 0, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 2 input channels at a time (16 outputs).
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate: lane 0 of `input` is the first channel
        // (8 outputs in acc[0..1]), lane 1 the second (acc[2..3]).
        acc[0] = vmlaq_lane_f32(acc[0], filter[0], input, 0);
        acc[1] = vmlaq_lane_f32(acc[1], filter[1], input, 0);
        acc[2] = vmlaq_lane_f32(acc[2], filter[2], input, 1);
        acc[3] = vmlaq_lane_f32(acc[3], filter[3], input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle one input channel at a time (odd-depth tail).
      for (; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[2];
        for (int i = 0; i < 2; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate the broadcast scalar input.
        for (int i = 0; i < 2; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
313 
// Specialization: strided access allowed, runtime input_depth,
// depth_multiplier == 2 (each input channel yields 2 adjacent outputs).
// Note: this implementation is very slow for input_depths < 8 (comparable
// to the reference implementation).  See the specializations for
// input_depth == 3 below.
template <>
struct FloatDepthwiseConvKernel<true, 0, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      int ic = 0;
      // Handle 8 input channels at a time (16 outputs).
      for (; ic <= input_depth - 8; ic += 8) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs.  vzipq duplicates each input channel into
        // adjacent lanes to match the 2-outputs-per-channel layout.
        float32x4x2_t input_dup2[2];
        for (int i = 0; i < 2; i++) {
          const float32x4_t input = vld1q_f32(local_input_ptr + 4 * i);
          input_dup2[i] = vzipq_f32(input, input);
        }
        local_input_ptr += 8;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate
        acc[0] = vmlaq_f32(acc[0], filter[0], input_dup2[0].val[0]);
        acc[1] = vmlaq_f32(acc[1], filter[1], input_dup2[0].val[1]);
        acc[2] = vmlaq_f32(acc[2], filter[2], input_dup2[1].val[0]);
        acc[3] = vmlaq_f32(acc[3], filter[3], input_dup2[1].val[1]);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      // Handle 4 input channels at a time (8 outputs).
      for (; ic <= input_depth - 4; ic += 4) {
        // Load the filters
        float32x2_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1_f32(local_filter_ptr + 2 * i);
        }
        local_filter_ptr += 8;
        // Load the inputs
        const float32x4_t input = vld1q_f32(local_input_ptr);
        local_input_ptr += 4;
        // Load the accumulators from acc_buffer
        float32x2_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate: one input lane broadcast per channel pair.
        acc[0] = vmla_lane_f32(acc[0], filter[0], vget_low_f32(input), 0);
        acc[1] = vmla_lane_f32(acc[1], filter[1], vget_low_f32(input), 1);
        acc[2] = vmla_lane_f32(acc[2], filter[2], vget_high_f32(input), 0);
        acc[3] = vmla_lane_f32(acc[3], filter[3], vget_high_f32(input), 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 8;
      }
      // Handle 2 input channels at a time (4 outputs).
      for (; ic <= input_depth - 2; ic += 2) {
        // Load the filters
        const float32x4_t filter = vld1q_f32(local_filter_ptr);
        local_filter_ptr += 4;
        // Load the inputs
        const float32x2_t input = vld1_f32(local_input_ptr);
        local_input_ptr += 2;
        // Load the accumulators from acc_buffer
        float32x2_t acc[2];
        for (int i = 0; i < 2; i++) {
          acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
        }
        // Multiply-accumulate
        acc[0] = vmla_lane_f32(acc[0], vget_low_f32(filter), input, 0);
        acc[1] = vmla_lane_f32(acc[1], vget_high_f32(filter), input, 1);
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 2; i++) {
          vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
        }
        acc_buffer_ptr += 4;
      }
      // Handle one input channel at a time (scalar tail, 2 outputs).
      for (; ic < input_depth; ic++) {
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Multiply-accumulate
        for (int i = 0; i < 2; i++) {
          acc_buffer_ptr[i] += local_filter_ptr[i] * input_val;
        }
        local_filter_ptr += 2;
        acc_buffer_ptr += 2;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
422 
// Specialization: strided access allowed, input_depth == 3,
// depth_multiplier == 2 (6 outputs per pixel).
template <>
struct FloatDepthwiseConvKernel<true, 3, 2> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters: 3 channels x 2 outputs, loop-invariant.
    float32x2_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1_f32(filter_ptr + 2 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Read the 3 input channels as a pair plus a duplicated third lane.
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x2_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1_f32(acc_buffer_ptr + 2 * i);
      }
      // Multiply-accumulate: each of the 3 input channels produces 2 outputs.
      acc[0] = vmla_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmla_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmla_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1_f32(acc_buffer_ptr + 2 * i, acc[i]);
      }
      acc_buffer_ptr += 6;
      input_ptr += input_ptr_increment;
    }
  }
};
455 
// Specialization: strided access allowed, input_depth == 3,
// depth_multiplier == 4 (12 outputs per pixel).
template <>
struct FloatDepthwiseConvKernel<true, 3, 4> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters: 3 channels x 4 outputs, loop-invariant.
    float32x4_t filter[3];
    for (int i = 0; i < 3; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // NOTE: we only want 3 values, so we read it as two ops where
      // the second op just duplicates the lane
      const float32x2_t input01 = vld1_f32(input_ptr);
      const float32x2_t input2 = vld1_dup_f32(input_ptr + 2);
      // Load the accumulators from acc_buffer
      float32x4_t acc[3];
      for (int i = 0; i < 3; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate all outputs: one broadcast input lane per
      // group of 4 outputs.
      acc[0] = vmlaq_lane_f32(acc[0], filter[0], input01, 0);
      acc[1] = vmlaq_lane_f32(acc[1], filter[1], input01, 1);
      acc[2] = vmlaq_lane_f32(acc[2], filter[2], input2, 0);
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 3; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 12;
      input_ptr += input_ptr_increment;
    }
  }
};
490 
// Specialization: strided access allowed, input_depth == 1,
// depth_multiplier == 8 (one input value fans out to 8 outputs per pixel).
template <>
struct FloatDepthwiseConvKernel<true, 1, 8> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // The 8 filter coefficients are loop-invariant; keep them in two
    // 4-lane registers for the whole call.
    const float32x4_t filter_lo = vld1q_f32(filter_ptr);
    const float32x4_t filter_hi = vld1q_f32(filter_ptr + 4);
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Single input channel value for this pixel.
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the 8 accumulators, fold in the broadcast input, store back.
      float32x4_t acc_lo = vld1q_f32(acc_buffer_ptr);
      float32x4_t acc_hi = vld1q_f32(acc_buffer_ptr + 4);
      acc_lo = vmlaq_n_f32(acc_lo, filter_lo, input_val);
      acc_hi = vmlaq_n_f32(acc_hi, filter_hi, input_val);
      vst1q_f32(acc_buffer_ptr, acc_lo);
      vst1q_f32(acc_buffer_ptr + 4, acc_hi);
      acc_buffer_ptr += 8;
    }
  }
};
523 
// Specialization: strided access allowed, input_depth == 1,
// depth_multiplier == 32.  The 32 filter taps stay resident in eight
// q-registers across all pixels; fully unrolled to avoid array spills.
template <>
struct FloatDepthwiseConvKernel<true, 1, 32> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters
    float32x4_t filter_0 = vld1q_f32(filter_ptr + 4 * 0);
    float32x4_t filter_1 = vld1q_f32(filter_ptr + 4 * 1);
    float32x4_t filter_2 = vld1q_f32(filter_ptr + 4 * 2);
    float32x4_t filter_3 = vld1q_f32(filter_ptr + 4 * 3);
    float32x4_t filter_4 = vld1q_f32(filter_ptr + 4 * 4);
    float32x4_t filter_5 = vld1q_f32(filter_ptr + 4 * 5);
    float32x4_t filter_6 = vld1q_f32(filter_ptr + 4 * 6);
    float32x4_t filter_7 = vld1q_f32(filter_ptr + 4 * 7);

    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs: a single channel value, broadcast below.
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load the accumulators from acc_buffer
      float32x4_t acc_0 = vld1q_f32(acc_buffer_ptr + 4 * 0);
      float32x4_t acc_1 = vld1q_f32(acc_buffer_ptr + 4 * 1);
      float32x4_t acc_2 = vld1q_f32(acc_buffer_ptr + 4 * 2);
      float32x4_t acc_3 = vld1q_f32(acc_buffer_ptr + 4 * 3);
      float32x4_t acc_4 = vld1q_f32(acc_buffer_ptr + 4 * 4);
      float32x4_t acc_5 = vld1q_f32(acc_buffer_ptr + 4 * 5);
      float32x4_t acc_6 = vld1q_f32(acc_buffer_ptr + 4 * 6);
      float32x4_t acc_7 = vld1q_f32(acc_buffer_ptr + 4 * 7);
      // Multiply-accumulate
      acc_0 = vmlaq_n_f32(acc_0, filter_0, input_val);
      acc_1 = vmlaq_n_f32(acc_1, filter_1, input_val);
      acc_2 = vmlaq_n_f32(acc_2, filter_2, input_val);
      acc_3 = vmlaq_n_f32(acc_3, filter_3, input_val);
      acc_4 = vmlaq_n_f32(acc_4, filter_4, input_val);
      acc_5 = vmlaq_n_f32(acc_5, filter_5, input_val);
      acc_6 = vmlaq_n_f32(acc_6, filter_6, input_val);
      acc_7 = vmlaq_n_f32(acc_7, filter_7, input_val);
      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr + 4 * 0, acc_0);
      vst1q_f32(acc_buffer_ptr + 4 * 1, acc_1);
      vst1q_f32(acc_buffer_ptr + 4 * 2, acc_2);
      vst1q_f32(acc_buffer_ptr + 4 * 3, acc_3);
      vst1q_f32(acc_buffer_ptr + 4 * 4, acc_4);
      vst1q_f32(acc_buffer_ptr + 4 * 5, acc_5);
      vst1q_f32(acc_buffer_ptr + 4 * 6, acc_6);
      vst1q_f32(acc_buffer_ptr + 4 * 7, acc_7);
      acc_buffer_ptr += 32;
    }
  }
};
575 
// Specialization: strided access allowed, input_depth == 1,
// depth_multiplier == 20 (one input value fans out to 20 outputs per pixel).
template <>
struct FloatDepthwiseConvKernel<true, 1, 20> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the 20 loop-invariant filter taps into five 4-lane registers.
    float32x4_t filter[5];
    for (int i = 0; i < 5; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Single input channel, broadcast against all 20 filter taps.
      const float input_val = *input_ptr;
      input_ptr += input_ptr_increment;
      // Load accumulators, fold in the broadcast input, store back.
      float32x4_t acc[5];
      for (int i = 0; i < 5; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      for (int i = 0; i < 5; i++) {
        acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
      }
      for (int i = 0; i < 5; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 20;
    }
  }
};
615 
// Specialization: strided access allowed, runtime input_depth,
// depth_multiplier == 16 (each input channel yields 16 outputs).
template <>
struct FloatDepthwiseConvKernel<true, 0, 16> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      const float* local_filter_ptr = filter_ptr;
      const float* local_input_ptr = input_ptr;
      // One input channel per iteration; filter advances 16 per channel.
      for (int ic = 0; ic < input_depth; ic++) {
        // Load the filters
        float32x4_t filter[4];
        for (int i = 0; i < 4; i++) {
          filter[i] = vld1q_f32(local_filter_ptr + 4 * i);
        }
        local_filter_ptr += 16;
        // Load the inputs
        const float input_val = *local_input_ptr++;
        // Load the accumulators from acc_buffer
        float32x4_t acc[4];
        for (int i = 0; i < 4; i++) {
          acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
        }
        // Multiply-accumulate the broadcast scalar input.
        for (int i = 0; i < 4; i++) {
          acc[i] = vmlaq_n_f32(acc[i], filter[i], input_val);
        }
        // Store the accumulators back to acc_buffer
        for (int i = 0; i < 4; i++) {
          vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
        }
        acc_buffer_ptr += 16;
      }
      input_ptr += input_ptr_increment;
    }
  }
};
653 
// Specialization: strided access allowed, input_depth == 8,
// depth_multiplier == 1.  Unlike the <false, 8, 1> variant above, the input
// pointer advances by input_ptr_increment between pixels, so no multi-pixel
// unrolling is done.
template <>
struct FloatDepthwiseConvKernel<true, 8, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Load the filters: 8 taps, loop-invariant.
    float32x4_t filter[2];
    for (int i = 0; i < 2; i++) {
      filter[i] = vld1q_f32(filter_ptr + 4 * i);
    }
    // Handle one output pixel at a time.
    for (int outp = 0; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x4_t input[2];
      for (int i = 0; i < 2; i++) {
        input[i] = vld1q_f32(input_ptr + 4 * i);
      }
      // Load the accumulators from acc_buffer
      float32x4_t acc[2];
      for (int i = 0; i < 2; i++) {
        acc[i] = vld1q_f32(acc_buffer_ptr + 4 * i);
      }
      // Multiply-accumulate
      for (int i = 0; i < 2; i++) {
        acc[i] = vmlaq_f32(acc[i], input[i], filter[i]);
      }
      // Store the accumulators back to acc_buffer
      for (int i = 0; i < 2; i++) {
        vst1q_f32(acc_buffer_ptr + 4 * i, acc[i]);
      }
      acc_buffer_ptr += 8;
      input_ptr += input_ptr_increment;
    }
  }
};
689 
// Specialization: strided access allowed, input_depth == 2,
// depth_multiplier == 1.  Two (possibly non-adjacent) pixels are combined
// into one q-register so the main loop does one vector MLA per pixel pair.
template <>
struct FloatDepthwiseConvKernel<true, 2, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // filter holds the 2 taps; filter_x4 repeats them for 2 pixels at once.
    float32x2_t filter = vld1_f32(filter_ptr);
    float32x4_t filter_x4 = vcombine_f32(filter, filter);
    int outp = 0;

    // Handle two output pixels at a time.
    for (; outp <= num_output_pixels - 2; outp += 2) {
      // Load the inputs: each pixel is loaded separately because the stride
      // between pixels is input_ptr_increment, not 2.
      float32x2_t input_1 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x2_t input_2 = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;
      float32x4_t input = vcombine_f32(input_1, input_2);

      // Load the accumulators from acc_buffer
      float32x4_t acc = vld1q_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmlaq_f32(acc, input, filter_x4);

      // Store the accumulators back to acc_buffer
      vst1q_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 4;
    }
    // Handle one output pixel at a time (tail when num_output_pixels is odd).
    for (; outp < num_output_pixels; outp++) {
      // Load the inputs
      float32x2_t input = vld1_f32(input_ptr);
      input_ptr += input_ptr_increment;

      // Load the accumulators from acc_buffer
      float32x2_t acc = vld1_f32(acc_buffer_ptr);

      // Multiply-accumulate
      acc = vmla_f32(acc, input, filter);

      // Store the accumulators back to acc_buffer
      vst1_f32(acc_buffer_ptr, acc);
      acc_buffer_ptr += 2;
    }
  }
};
736 
// Specialization: strided access allowed, input_depth == 4,
// depth_multiplier == 1.  One vector multiply-accumulate per output pixel.
template <>
struct FloatDepthwiseConvKernel<true, 4, 1> {
  static void Run(int num_output_pixels, int input_depth, int depth_multiplier,
                  const float* input_ptr, int input_ptr_increment,
                  const float* filter_ptr, float* acc_buffer_ptr) {
    // Four filter taps, invariant across pixels.
    const float32x4_t filter_vec = vld1q_f32(filter_ptr);

    // Handle one output pixel (4 channels) per iteration.
    for (int outp = 0; outp < num_output_pixels;
         ++outp, input_ptr += input_ptr_increment, acc_buffer_ptr += 4) {
      // Read input and accumulator, fold in the product, write back.
      const float32x4_t input_vec = vld1q_f32(input_ptr);
      const float32x4_t updated_acc =
          vmlaq_f32(vld1q_f32(acc_buffer_ptr), input_vec, filter_vec);
      vst1q_f32(acc_buffer_ptr, updated_acc);
    }
  }
};
759 #endif
760 
// Accumulates the effect of one row of the filter, on a segment of one row
// of the output, accessing the corresponding one row of the input.
//
// The template parameters pick a specialized FloatDepthwiseConvKernel:
//   kAllowStrided         - whether stride != 1 is supported;
//   kFixedInputDepth      - compile-time input depth (0 means runtime);
//   kFixedDepthMultiplier - compile-time depth multiplier (0 means runtime).
template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
                                int input_depth, int input_width,
                                const float* input_data, int pad_width,
                                int depth_multiplier, int filter_width,
                                const float* filter_data,
                                int out_x_buffer_start, int out_x_buffer_end,
                                int output_depth, float* acc_buffer) {
  ruy::profiler::ScopeLabel label(__PRETTY_FUNCTION__);
  // Consistency check parameters. This is important in particular to ensure
  // that we keep the number of template instantiations minimal, so we don't
  // increase binary size unnecessarily.
  static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
  static_assert(kFixedInputDepth || kAllowStrided, "");
  TFLITE_DCHECK(stride == 1 || kAllowStrided);
  if (kFixedInputDepth) {
    TFLITE_DCHECK_EQ(input_depth, kFixedInputDepth);
  }
  if (kFixedDepthMultiplier) {
    TFLITE_DCHECK_EQ(depth_multiplier, kFixedDepthMultiplier);
  }
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  // Distance (in floats) between consecutive input pixels read by the kernel.
  const int input_ptr_increment = stride * input_depth;
  const float* filter_base_ptr = filter_data;
  for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
    // For the current (filter_x, filter_y) point in the filter,
    // compute the boundaries of the corresponding output row segment.
    // "Unclamped" = the full range of out_x for which this filter tap reads
    // an in-bounds input column; it is then intersected with the buffer
    // window below. Integer division here truncates toward zero, and the
    // "+ stride - 1" makes it a ceiling for non-negative numerators.
    int out_x_loop_start_unclamped = 0;
    int out_x_loop_end_unclamped = 0;
    if (kAllowStrided) {
      if (stride == 2) {
        // Strides 2 and 4 are spelled out with literal divisors so the
        // compiler can reduce them to shifts; the formulas are the same as
        // the generic (x + stride - 1) / stride case below.
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + 1) / 2;
        out_x_loop_end_unclamped =
            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
      } else if (stride == 4) {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + 3) / 4;
        out_x_loop_end_unclamped =
            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
      } else {
        out_x_loop_start_unclamped =
            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
        out_x_loop_end_unclamped = (pad_width + input_width -
                                    dilation_factor * filter_x + stride - 1) /
                                   stride;
      }
    } else {
      // stride == 1: the ceiling-division collapses to a plain difference.
      out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
      out_x_loop_end_unclamped =
          pad_width + input_width - dilation_factor * filter_x;
    }
    // The kernel will have to iterate on the segment of the
    // output row that starts at out_x_loop_start and out_x_loop_end.
    const int out_x_loop_start =
        std::max(out_x_buffer_start, out_x_loop_start_unclamped);
    const int out_x_loop_end =
        std::min(out_x_buffer_end, out_x_loop_end_unclamped);

    float* acc_buffer_ptr =
        acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
    // First input column touched by this filter tap for out_x_loop_start.
    const int in_x_origin =
        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
    const float* input_ptr = input_data + in_x_origin * input_depth;
    // May be zero when the clamped range is empty; the kernel then does
    // nothing for this filter tap.
    const int num_output_pixels = out_x_loop_end - out_x_loop_start;
    FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
                             kFixedDepthMultiplier>::Run(num_output_pixels,
                                                         input_depth,
                                                         depth_multiplier,
                                                         input_ptr,
                                                         input_ptr_increment,
                                                         filter_base_ptr,
                                                         acc_buffer_ptr);
    // Advance to the weights of the next filter_x column.
    filter_base_ptr += output_depth;
  }
}
839 
840 // generic fallback of FloatDepthwiseConvAccumRow, portable, non-templatized.
841 inline void FloatDepthwiseConvAccumRowGeneric(
842     int stride, int dilation_factor, int input_depth, int input_width,
843     const float* input_data, int pad_width, int depth_multiplier,
844     int filter_width, const float* filter_data, int out_x_buffer_start,
845     int out_x_buffer_end, int output_depth, float* acc_buffer) {
846   ruy::profiler::ScopeLabel label("DepthwiseConvAccumRowGeneric (slow)");
847   const float* filter_base_ptr = filter_data;
848   for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
849     const int out_x_loop_start = std::max(
850         out_x_buffer_start,
851         (pad_width - dilation_factor * filter_x + stride - 1) / stride);
852     const int out_x_loop_end = std::min(
853         out_x_buffer_end,
854         (pad_width + input_width - dilation_factor * filter_x + stride - 1) /
855             stride);
856 
857     float* acc_buffer_ptr =
858         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
859     const int in_x_origin =
860         (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
861     const float* input_ptr = input_data + in_x_origin * input_depth;
862     const int input_ptr_increment = (stride - 1) * input_depth;
863     for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++) {
864       const float* filter_ptr = filter_base_ptr;
865       for (int ic = 0; ic < input_depth; ++ic) {
866         const float input_val = *input_ptr++;
867         for (int m = 0; m < depth_multiplier; m++) {
868           const float filter_val = *filter_ptr++;
869           *acc_buffer_ptr++ += filter_val * input_val;
870         }
871       }
872       input_ptr += input_ptr_increment;
873     }
874     filter_base_ptr += output_depth;
875   }
876 }
877 
// Seeds the accumulator buffer: each of the num_output_pixels rows of
// output_depth floats is initialized to a copy of the bias vector.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       const float* bias_data,
                                       float* acc_buffer) {
  // TODO(benoitjacob): This might need optimized specializations
  // for small output_depth values, if that ever becomes an important
  // case (like it was for some quantized DepthwiseConv cases).
  for (int pixel = 0; pixel < num_output_pixels; ++pixel) {
    float* dst = acc_buffer + pixel * output_depth;
    memcpy(dst, bias_data, output_depth * sizeof(float));
  }
}
890 
// DepthwiseConv can run with multi threads on the dim specified by thread_dim.
// Each thread processes output elements on dim, thread_dim, in the range of
// [thread_start, thread_end).
// For example, assume thread_start = 2, thread_end = 6, and thread_dim = 1, it
// means that it will calculate DepthwiseConv for output_data[:, 2:6, :, :].
//
// The cpu_flags is currently unused. This
// parameter is included so that the signature matches that required by a
// templated function. Other versions, such as quantized, need this parameter.
inline void DepthwiseConvImpl(
    const DepthwiseParams& params, const RuntimeShape& input_shape,
    const float* input_data, const RuntimeShape& filter_shape,
    const float* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const CpuFlags& /* cpu_flags */, int thread_start,
    int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConv/float/DepthwiseConvImpl");

  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  // All four tensors are expected in 4-D NHWC layout.
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_height = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

  // Stack-allocated accumulator scratch buffer. Output rows are processed in
  // chunks of kOutputPixelsInAccBuffer pixels so that all accumulation for a
  // chunk happens in this buffer before clamping and storing to output_data.
  // NOTE(review): the specific size 4832 looks tuned upstream (it must only
  // be >= output_depth, which is DCHECKed below) — confirm before changing.
  static const int kAccBufferMaxSize = 4832;
  float acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  // Number of whole output pixels that fit in the accumulator buffer.
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConv op.
  using row_accum_func_t = decltype(&FloatDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

// Picks the first kernel (in declaration order below) whose compile-time
// (ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) triple matches
// this op's runtime parameters; FIXED_INPUT_DEPTH == 0 matches any depth.
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                   FIXED_DEPTH_MULTIPLIER>;               \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 3, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth,
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 16)

#endif  // USE_NEON

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = FloatDepthwiseConvAccumRowGeneric;
  }

  // Flat strides (in floats) for stepping through NHWC tensors.
  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_height;
  int output_ptr_offset = 0;

  switch (thread_dim) {
    case 0:
      // Multithread along with the batch axis
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      // Multithread along with the row axis
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_height);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
  // After finishing this thread's row range in one batch, skip over the rows
  // this thread does NOT own so output_ptr lands on row_start of the next
  // batch. Zero when threading over batches (row_start == 0, row_end ==
  // output_height).
  int batch_step =
      (output_height + row_start - row_end) * output_width * output_depth;

  for (int b = batch_start; b < batch_end; ++b) {
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
      // Clamp the filter's vertical taps to those whose dilated input row
      // falls inside [0, input_height); rows outside are implicit zeros.
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activation that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        // Initialize our local accumulator with the bias values, so we don't
        // have to add them later.
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data,
                                   acc_buffer);
        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride, out_x_buffer_start,
              out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating. Now store to destination, clamping every
        // value to [output_activation_min, output_activation_max].
        const int num_output_values = output_depth * num_output_pixels;
        int i = 0;
// TODO(benoitjacob) optimized code goes here
#ifdef USE_NEON
        // Handle 16 values at a time
        for (; i <= num_output_values - 16; i += 16) {
          float32x4_t acc[4];
          for (int k = 0; k < 4; k++) {
            acc[k] = vld1q_f32(acc_buffer + i + 4 * k);
          }
          for (int k = 0; k < 4; k++) {
            acc[k] = vmaxq_f32(
                vdupq_n_f32(output_activation_min),
                vminq_f32(vdupq_n_f32(output_activation_max), acc[k]));
          }
          for (int k = 0; k < 4; k++) {
            vst1q_f32(output_ptr + 4 * k, acc[k]);
          }
          output_ptr += 16;
        }
        // Handle 4 values at a time
        for (; i <= num_output_values - 4; i += 4) {
          float32x4_t acc = vld1q_f32(acc_buffer + i);

          acc = vmaxq_f32(vdupq_n_f32(output_activation_min),
                          vminq_f32(vdupq_n_f32(output_activation_max), acc));

          vst1q_f32(output_ptr, acc);
          output_ptr += 4;
        }
#endif
        // Handle leftover values, one by one. This is very slow.
        for (; i < num_output_values; i++) {
          float acc = acc_buffer[i];
          acc = std::max(output_activation_min,
                         std::min(output_activation_max, acc));

          *output_ptr++ = acc;
        }
      }
    }
    output_ptr += batch_step;
  }
}
1110 
1111 
1112 }  // namespace optimized_ops
1113 }  // namespace tflite
1114 
1115 #endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
1116