• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <gtest/gtest.h>
7 
8 #include <xnnpack/common.h>
9 #include <xnnpack/isa-checks.h>
10 
11 #include <xnnpack/conv.h>
12 #include "conv-hwc-microkernel-tester.h"
13 
14 
15 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: input width 4, input height 3, and output channels
// equal to the output-channel tile (8).
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(4).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
}
29 
// Sweeps input widths 8, 20 and 32 (all multiples of 4) with 8 output
// channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 8; width <= 32; width += 12) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
  }
}
45 
// Sweeps input widths 1 through 3 with 8 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 1; width < 4; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
  }
}
61 
// Sweeps input widths 5 through 7 with 8 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
  }
}
77 
// Sweeps output channel counts 1 through 7 (below the tile of 8) across
// input widths 1, 8, 15, 22 and 29, with input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 8; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
    }
  }
}
95 
// Sweeps output channel counts 16, 24 and 32 (multiples of the tile of 8)
// across input widths 1, 8, 15, 22 and 29, with input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 16; channels <= 32; channels += 8) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
    }
  }
}
113 
// Sweeps output channel counts 9 through 15 (above the tile of 8) across
// input widths 1, 8, 15, 22 and 29, with input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 9; channels < 16; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
    }
  }
}
131 
// Sweeps input heights 1 and 2 with output channels 1, 8, 15 and input
// widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding(1)  // padded input height of at least 3 required
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
      }
    }
  }
}
151 
// Sweeps input heights 4 through 9 with output channels 1, 8, 15 and input
// widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
      }
    }
  }
}
171 
// Runs with padding_top of 0 and 1 at input height 9, across output channels
// 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1).padding_top(top)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
      }
    }
  }
}
192 
// Runs with padding_bottom of 0 and 1 at input height 9, across output
// channels 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1).padding_bottom(bottom)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
      }
    }
  }
}
213 
// Runs with output_y_start of 1, 2 and 3 at input height 9, across output
// channels 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_start(y_start)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
      }
    }
  }
}
234 
// Runs with output_y_end of 2, 3 and 4 at input height 9, across output
// channels 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_end(y_end)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
      }
    }
  }
}
255 
// Runs with qmin set to 128 at input height 6, across output channels
// 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(6)
          .qmin(128)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
    }
  }
}
274 
// Runs with qmax set to 128 at input height 6, across output channels
// 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(6)
          .qmax(128)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x2);
    }
  }
}
293 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
294 
295 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration: input width 4, input height 3, and output channels
// equal to the output-channel tile (4).
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(4).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
}
309 
// Sweeps input widths 8, 20 and 32 (all multiples of 4) with 4 output
// channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 8; width <= 32; width += 12) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(4)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
  }
}
325 
// Sweeps input widths 1 through 3 with 4 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 1; width < 4; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(4)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
  }
}
341 
// Sweeps input widths 5 through 7 with 4 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(4)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
  }
}
357 
// Sweeps output channel counts 1 through 3 (below the tile of 4) across
// input widths 1, 8, 15, 22 and 29, with input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 4; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
    }
  }
}
375 
// Sweeps output channel counts 8, 12 and 16 (multiples of the tile of 4)
// across input widths 1, 8, 15, 22 and 29, with input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 8; channels <= 16; channels += 4) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
    }
  }
}
393 
// Sweeps output channel counts 5 through 7 (above the tile of 4) across
// input widths 1, 8, 15, 22 and 29, with input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 5; channels < 8; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
    }
  }
}
411 
// Sweeps input heights 1 and 2 with output channels 1, 4, 7 and input
// widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding(1)  // padded input height of at least 3 required
            .input_channels(3).output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
      }
    }
  }
}
431 
// Sweeps input heights 4 through 9 with output channels 1, 4, 7 and input
// widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1)
            .input_channels(3).output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
      }
    }
  }
}
451 
// Runs with padding_top of 0 and 1 at input height 9, across output channels
// 1, 8, 15 (with a tile of 4) and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1).padding_top(top)
            .input_channels(3).output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
      }
    }
  }
}
472 
// Runs with padding_bottom of 0 and 1 at input height 9, across output
// channels 1, 8, 15 (with a tile of 4) and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1).padding_bottom(bottom)
            .input_channels(3).output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
      }
    }
  }
}
493 
// Runs with output_y_start of 1, 2 and 3 at input height 9, across output
// channels 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1)
            .input_channels(3).output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_start(y_start)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
      }
    }
  }
}
514 
// Runs with output_y_end of 2, 3 and 4 at input height 9, across output
// channels 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_width(1)
            .input_channels(3).output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_end(y_end)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
      }
    }
  }
}
535 
// Runs with qmin set to 128 at input height 6, across output channels
// 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(6)
          .qmin(128)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
    }
  }
}
554 
// Runs with qmax set to 128 at input height 6, across output channels
// 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(6)
          .qmax(128)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x2);
    }
  }
}
573 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
574 
575 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration with padding_right of 1: input width 4, input height 3,
// and output channels equal to the output-channel tile (8).
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(4).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
}
589 
// Sweeps input widths 8, 20 and 32 (all multiples of 4) with 8 output
// channels, input height 3 and padding_right of 1.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 8; width <= 32; width += 12) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
  }
}
605 
// Sweeps input widths 2 and 3 with 8 output channels, input height 3 and
// padding_right of 1.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 2; width < 4; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
  }
}
621 
// Sweeps input widths 5 through 7 with 8 output channels, input height 3 and
// padding_right of 1.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
  }
}
637 
// Sweeps output channel counts 1 through 7 (below the tile of 8) across
// input widths 2, 9, 16, 23 and 30, with input height 3.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 8; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
    }
  }
}
655 
// Sweeps output channel counts 16, 24 and 32 (multiples of the tile of 8)
// across input widths 2, 9, 16, 23 and 30, with input height 3.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 16; channels <= 32; channels += 8) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
    }
  }
}
673 
// Sweeps output channel counts 9 through 15 (above the tile of 8) across
// input widths 2, 9, 16, 23 and 30, with input height 3.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 9; channels < 16; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
    }
  }
}
691 
// Sweeps input heights 1 and 2 with padding_height of 1 and padding_right of
// 1, across output channels 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_height(1).padding_right(1)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
      }
    }
  }
}
712 
// Sweeps input heights 4 through 9 with output channels 1, 8, 15 and input
// widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_right(1)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
      }
    }
  }
}
732 
// Runs with padding_top of 0 and 1 at input height 9, across output channels
// 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_right(1).padding_top(top)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
      }
    }
  }
}
753 
// Runs with padding_bottom of 0 and 1 at input height 9, across output
// channels 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_right(1).padding_bottom(bottom)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
      }
    }
  }
}
774 
// Runs with output_y_start of 1, 2 and 3 at input height 9, across output
// channels 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2).padding_right(1)
            .input_channels(3).output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_start(y_start)
            .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
      }
    }
  }
}
795 
// Runs the 3x3s2p0p1c3x8 NEON 2x2 kernel with output_y_end set to 2, 3, 4
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
      }
    }
  }
}
816 
// Runs the 3x3s2p0p1c3x8 NEON 2x2 kernel with qmin set to 128 on a 6-row
// input, for output channel counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
    }
  }
}
835 
// Runs the 3x3s2p0p1c3x8 NEON 2x2 kernel with qmax set to 128 on a 6-row
// input, for output channel counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEON_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neon_2x2);
    }
  }
}
854 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
855 
856 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel once on a 3-row input whose width is
// exactly 4, with 4 output channels (one full output-channel tile).
// The width must stay at 4 to match the test name: widths 5..7 are already
// exercised by input_width_gt_4, so any other value leaves width 4 untested.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON;
  ConvHWCMicrokernelTester()
    .kernel_size(3)
    .subsampling(2)
    .padding_right(1)
    .input_channels(3)
    .output_channels_tile(4)
    .output_channels(4)
    .input_width(4)
    .input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
}
870 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel on 3-row inputs whose widths are
// the multiples of 4 visited by the loop: 8, 20, 32.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 8; width < 33; width += 12) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_right(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
  }
}
886 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel on 3-row inputs of width 2 and 3.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 2; width < 4; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_right(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
  }
}
902 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel on 3-row inputs of width 5, 6, 7.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_right(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
  }
}
918 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with 1, 2, 3 output channels
// (fewer than the tile of 4), for input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 4; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
    }
  }
}
936 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with 8, 12, 16 output channels
// (multiples of the tile of 4), for input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 8; channels < 17; channels += 4) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
    }
  }
}
954 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with 5, 6, 7 output channels
// (more than one tile of 4, not a multiple), for input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 5; channels < 8; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
    }
  }
}
972 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel on inputs of height 1 and 2,
// for output channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          // Height padding is needed here: the padded input height must be at least 3.
          .padding_height(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
      }
    }
  }
}
993 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel on inputs of height 4 through 9,
// for output channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t height = 4; height < 10; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
      }
    }
  }
}
1013 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with padding_top set to 0 and 1
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t top = 0; top < 2; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1).padding_top(top)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
      }
    }
  }
}
1034 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with padding_bottom set to 0 and 1
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t bottom = 0; bottom < 2; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1).padding_bottom(bottom)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
      }
    }
  }
}
1055 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with output_y_start set to 1, 2, 3
// on a 9-row input, for output channel counts 1, 4, 7 and input widths
// 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_start = 1; y_start < 4; ++y_start) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
      }
    }
  }
}
1076 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with output_y_end set to 2, 3, 4
// on a 9-row input, for output channel counts 1, 4, 7 and input widths
// 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
      }
    }
  }
}
1097 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with qmin set to 128 on a 6-row
// input, for output channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
    }
  }
}
1116 
// Runs the 3x3s2p0p1c3x4 NEON 2x2 kernel with qmax set to 128 on a 6-row
// input, for output channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEON_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neon_2x2);
    }
  }
}
1135 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1136 
1137 #if XNN_ARCH_ARM64
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel once on a 3-row input of width 4,
// with 8 output channels (one full output-channel tile).
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  ConvHWCMicrokernelTester()
    .kernel_size(3).subsampling(2)
    .padding_width(1)
    .input_channels(3)
    .output_channels_tile(8).output_channels(8)
    .input_width(4).input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
}
1151 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel on 3-row inputs whose widths are
// the multiples of 4 visited by the loop: 8, 20, 32.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 8; width < 33; width += 12) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
  }
}
1167 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel on 3-row inputs of width 1, 2, 3.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 1; width < 4; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
  }
}
1183 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel on 3-row inputs of width 5, 6, 7.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
  }
}
1199 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with 1 through 7 output channels
// (fewer than the tile of 8), for input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
    }
  }
}
1217 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with 16, 24, 32 output channels
// (multiples of the tile of 8), for input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 16; channels < 33; channels += 8) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
    }
  }
}
1235 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with 9 through 15 output channels
// (more than one tile of 8, not a multiple), for input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 9; channels < 16; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
    }
  }
}
1253 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel on inputs of height 1 and 2,
// for output channel counts 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          // padding(1) rather than padding_width(1): the padded input height must be at least 3.
          .padding(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
      }
    }
  }
}
1273 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel on inputs of height 4 through 9,
// for output channel counts 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 4; height < 10; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
      }
    }
  }
}
1293 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with padding_top set to 0 and 1
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t top = 0; top < 2; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1).padding_top(top)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
      }
    }
  }
}
1314 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with padding_bottom set to 0 and 1
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t bottom = 0; bottom < 2; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1).padding_bottom(bottom)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
      }
    }
  }
}
1335 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with output_y_start set to 1, 2, 3
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_start = 1; y_start < 4; ++y_start) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
      }
    }
  }
}
1356 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with output_y_end set to 2, 3, 4
// on a 9-row input, for output channel counts 1, 8, 15 and input widths
// 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
      }
    }
  }
}
1377 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with qmin set to 128 on a 6-row
// input, for output channel counts 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
    }
  }
}
1396 
// Runs the 3x3s2p1c3x8 NEONFMA 2x2 kernel with qmax set to 128 on a 6-row
// input, for output channel counts 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x2);
    }
  }
}
1415 #endif  // XNN_ARCH_ARM64
1416 
1417 #if XNN_ARCH_ARM64
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel once on a 3-row input of width 4,
// with 4 output channels (one full output-channel tile).
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  ConvHWCMicrokernelTester()
    .kernel_size(3).subsampling(2)
    .padding_width(1)
    .input_channels(3)
    .output_channels_tile(4).output_channels(4)
    .input_width(4).input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
}
1431 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel on 3-row inputs whose widths are
// the multiples of 4 visited by the loop: 8, 20, 32.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 8; width < 33; width += 12) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
  }
}
1447 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel on 3-row inputs of width 1, 2, 3.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 1; width < 4; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
  }
}
1463 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel on 3-row inputs of width 5, 6, 7.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
  }
}
1479 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel with 1, 2, 3 output channels
// (fewer than the tile of 4), for input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 4; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
    }
  }
}
1497 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel with 8, 12, 16 output channels
// (multiples of the tile of 4), for input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 8; channels < 17; channels += 4) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
    }
  }
}
1515 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel with 5, 6, 7 output channels
// (more than one tile of 4, not a multiple), for input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 5; channels < 8; ++channels) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
    }
  }
}
1533 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel on inputs of height 1 and 2,
// for output channel counts 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          // padding(1) rather than padding_width(1): the padded input height must be at least 3.
          .padding(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
      }
    }
  }
}
1553 
// Runs the 3x3s2p1c3x4 NEONFMA 2x2 kernel on inputs of height 4 through 9,
// for output channel counts 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 4; height < 10; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
      }
    }
  }
}
1573 
// Top padding of 0 and 1 on a 9-row input, crossed with output channel counts
// 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1).padding_top(top)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
      }
    }
  }
}
1594 
// Bottom padding of 0 and 1 on a 9-row input, crossed with output channel
// counts 1, 8, 15 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1).padding_bottom(bottom)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
      }
    }
  }
}
1615 
// output_y_start values 1 through 3 on a 9-row input, crossed with output
// channel counts 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
      }
    }
  }
}
1636 
// output_y_end values 2 through 4 on a 9-row input, crossed with output
// channel counts 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 1; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
      }
    }
  }
}
1657 
// qmin set to 128 on a 6-row input, crossed with output channel counts
// 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
    }
  }
}
1676 
// qmax set to 128 on a 6-row input, crossed with output channel counts
// 1, 4, 7 and input widths 1, 8, 15, 22, 29.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 1; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x2);
    }
  }
}
1695 #endif  // XNN_ARCH_ARM64
1696 
1697 #if XNN_ARCH_ARM64
// Single configuration: input width 4, input height 3, 8 output channels
// (one full 8-channel tile), right padding of 1.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  ConvHWCMicrokernelTester()
    .kernel_size(3).subsampling(2).padding_right(1)
    .input_channels(3).output_channels_tile(8).output_channels(8)
    .input_width(4).input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
}
1711 
// Input widths 8, 20 and 32 (all multiples of 4) with 8 output channels and a
// 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 8; width <= 32; width += 12) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
  }
}
1727 
// Input widths 2 and 3 with 8 output channels and a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 2; width < 4; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
  }
}
1743 
// Input widths 5, 6 and 7 with 8 output channels and a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
  }
}
1759 
// Output channel counts 1-7 (below the 8-channel tile), each at input widths
// 2, 9, 16, 23 and 30 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
    }
  }
}
1777 
// Output channel counts 16, 24 and 32 (multiples of the 8-channel tile), each
// at input widths 2, 9, 16, 23 and 30 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 16; channels <= 32; channels += 8) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
    }
  }
}
1795 
// Output channel counts 9-15 (above the 8-channel tile), each at input widths
// 2, 9, 16, 23 and 30 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 9; channels < 16; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
    }
  }
}
1813 
// Input heights 1 and 2 (shorter than the 3-row kernel), crossed with output
// channel counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          // NOTE(review): presumably here so the padded input height reaches 3 - confirm.
          .padding_height(1)
          .padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
      }
    }
  }
}
1834 
// Input heights 4 through 9, crossed with output channel counts 1, 8, 15 and
// input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
      }
    }
  }
}
1854 
// Top padding of 0 and 1 on a 9-row input, crossed with output channel counts
// 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1).padding_top(top)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
      }
    }
  }
}
1875 
// Bottom padding of 0 and 1 on a 9-row input, crossed with output channel
// counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1).padding_bottom(bottom)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
      }
    }
  }
}
1896 
// output_y_start values 1 through 3 on a 9-row input, crossed with output
// channel counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
      }
    }
  }
}
1917 
// output_y_end values 2 through 4 on a 9-row input, crossed with output
// channel counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
      }
    }
  }
}
1938 
// qmin set to 128 on a 6-row input, crossed with output channel counts
// 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
    }
  }
}
1957 
// qmax set to 128 on a 6-row input, crossed with output channel counts
// 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 16; channels += 7) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x2);
    }
  }
}
1976 #endif  // XNN_ARCH_ARM64
1977 
1978 #if XNN_ARCH_ARM64
// Single configuration: input width exactly 4 (as the test name states),
// input height 3, 4 output channels (one full 4-channel tile), right padding
// of 1. Widths 5-7 are covered separately by input_width_gt_4.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, input_width_eq_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  ConvHWCMicrokernelTester()
    .kernel_size(3)
    .subsampling(2)
    .padding_right(1)
    .input_channels(3)
    .output_channels_tile(4)
    .output_channels(4)
    .input_width(4)
    .input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
}
1992 
// Input widths 8, 20 and 32 (all multiples of 4) with 4 output channels and a
// 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, input_width_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 8; width <= 32; width += 12) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
  }
}
2008 
// Input widths 2 and 3 with 4 output channels and a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, input_width_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 2; width < 4; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
  }
}
2024 
// Input widths 5, 6 and 7 with 4 output channels and a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, input_width_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 5; width < 8; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
  }
}
2040 
// Output channel counts 1-3 (below the 4-channel tile), each at input widths
// 2, 9, 16, 23 and 30 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 4; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
    }
  }
}
2058 
// Output channel counts 8, 12 and 16 (multiples of the 4-channel tile), each
// at input widths 2, 9, 16, 23 and 30 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 8; channels <= 16; channels += 4) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
    }
  }
}
2076 
// Output channel counts 5-7 (above the 4-channel tile), each at input widths
// 2, 9, 16, 23 and 30 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 5; channels < 8; ++channels) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
    }
  }
}
2094 
// Input heights 1 and 2 (shorter than the 3-row kernel), crossed with output
// channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 1; height < 3; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .padding_height(1)  // the padded input height must be at least 3
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
      }
    }
  }
}
2115 
// Input heights 4 through 9, crossed with output channel counts 1, 4, 7 and
// input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
      }
    }
  }
}
2135 
// Top padding of 0 and 1 on a 9-row input, crossed with output channel counts
// 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1).padding_top(top)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
      }
    }
  }
}
2156 
// Bottom padding of 0 and 1 on a 9-row input, crossed with output channel
// counts 1, 8, 15 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels < 16; channels += 7) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1).padding_bottom(bottom)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
      }
    }
  }
}
2177 
// output_y_start values 1 through 3 on a 9-row input, crossed with output
// channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
      }
    }
  }
}
2198 
// output_y_end values 2 through 4 on a 9-row input, crossed with output
// channel counts 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_end = 2; y_end < 5; ++y_end) {
    for (size_t channels = 1; channels < 8; channels += 3) {
      for (size_t width = 2; width < 32; width += 7) {
        ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2).padding_right(1)
          .input_channels(3).output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
      }
    }
  }
}
2219 
// qmin set to 128 on a 6-row input, crossed with output channel counts
// 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
    }
  }
}
2238 
// qmax set to 128 on a 6-row input, crossed with output channel counts
// 1, 4, 7 and input widths 2, 9, 16, 23, 30.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X2, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 2; width < 32; width += 7) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(4).output_channels(channels)
        .input_width(width).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x2);
    }
  }
}
2257 #endif  // XNN_ARCH_ARM64
2258 
2259 #if XNN_ARCH_ARM64
// Single configuration: input width 2, input height 3, 8 output channels
// (one full 8-channel tile), right padding of 1.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, input_width_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  ConvHWCMicrokernelTester()
    .kernel_size(3).subsampling(2).padding_right(1)
    .input_channels(3).output_channels_tile(8).output_channels(8)
    .input_width(2).input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1);
}
2273 
// Input widths 4, 10 and 16 (all multiples of 2) with 8 output channels and a
// 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, input_width_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 4; width <= 16; width += 6) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1);
  }
}
2289 
// Input width 3 (the loop range [3, 4) holds a single value) with 8 output
// channels and a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, input_width_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t width = 3; width < 4; ++width) {
    ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2).padding_right(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(width).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1);
  }
}
2305 
// Output channel counts 1-7 (below the 8-channel tile), each at input widths
// 2, 5, 8, 11 and 14 with a 3-row input.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t channels = 1; channels < 8; ++channels) {
    for (size_t width = 2; width < 16; width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2).padding_right(1)
        .input_channels(3).output_channels_tile(8).output_channels(channels)
        .input_width(width).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1);
    }
  }
}
2323 
// Output channel counts 16, 24 and 32 (multiples of the tile of 8), each
// combined with input widths 2, 5, 8, 11 and 14 at an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t channels = 16; channels <= 32; channels += 8) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2341 
// Output channel counts 9..15 (between one and two tiles of 8), each combined
// with input widths 2, 5, 8, 11 and 14 at an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t channels = 9; channels <= 15; ++channels) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2359 
// Input heights 1 and 2, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14.
// NOTE(review): padding_height(1) is presumably needed so the padded input is
// at least as tall as the 3-row kernel -- confirm against the tester.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t height = 1; height <= 2; ++height) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_height(1)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(ukernel);
      }
    }
  }
}
2380 
// Input heights 4..9, crossed with output channels 1, 8, 15 and input widths
// 2, 5, 8, 11, 14.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(ukernel);
      }
    }
  }
}
2400 
// Top padding of 0 and 1, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .padding_top(top)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(ukernel);
      }
    }
  }
}
2421 
// Bottom padding of 0 and 1, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .padding_bottom(bottom)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(ukernel);
      }
    }
  }
}
2442 
// output_y_start values 1..3, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_start(y_start)
            .Test(ukernel);
      }
    }
  }
}
2463 
// output_y_end values 2..4, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t y_end = 2; y_end <= 4; ++y_end) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_end(y_end)
            .Test(ukernel);
      }
    }
  }
}
2484 
// Runs with qmin set to 128 for output channels 1, 8, 15 and input widths
// 2, 5, 8, 11, 14, at an input height of 6.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t channels = 1; channels <= 15; channels += 7) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(6)
          .qmin(128)
          .Test(ukernel);
    }
  }
}
2503 
// Runs with qmax set to 128 for output channels 1, 8, 15 and input widths
// 2, 5, 8, 11, 14, at an input height of 6.
TEST(F32_CONV_3X3S2P0P1C3X8__NEONFMA_2X1, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x8__neonfma_2x1;
  for (size_t channels = 1; channels <= 15; channels += 7) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(6)
          .qmax(128)
          .Test(ukernel);
    }
  }
}
2522 #endif  // XNN_ARCH_ARM64
2523 
2524 #if XNN_ARCH_ARM64
// Single configuration: input width 2, input height 3 and 4 output channels
// (the same value passed as the output-channel tile).
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, input_width_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_right(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(2).input_height(3)
      .Test(ukernel);
}
2538 
// Runs the microkernel for input widths 4, 10 and 16 with 4 output channels
// and an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, input_width_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t width = 4; width <= 16; width += 6) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(4)
        .input_width(width).input_height(3)
        .Test(ukernel);
  }
}
2554 
// Covers input widths in [3, 3]; the loop visits only width 3. Uses 4 output
// channels and an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, input_width_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t width = 3; width <= 3; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4).output_channels(4)
        .input_width(width).input_height(3)
        .Test(ukernel);
  }
}
2570 
// Output channel counts 1..3 (below the tile of 4), each combined with input
// widths 2, 5, 8, 11 and 14 at an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t channels = 1; channels <= 3; ++channels) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2588 
// Output channel counts 8, 12 and 16 (multiples of the tile of 4), each
// combined with input widths 2, 5, 8, 11 and 14 at an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t channels = 8; channels <= 16; channels += 4) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2606 
// Output channel counts 5..7 (between one and two tiles of 4), each combined
// with input widths 2, 5, 8, 11 and 14 at an input height of 3.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t channels = 5; channels <= 7; ++channels) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2624 
// Input heights 1 and 2, crossed with output channels 1, 4, 7 and input
// widths 2, 5, 8, 11, 14.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t height = 1; height <= 2; ++height) {
    for (size_t channels = 1; channels <= 7; channels += 3) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            // A padded input height of at least 3 is required, hence the
            // vertical padding.
            .padding_height(1)
            .input_channels(3)
            .output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(ukernel);
      }
    }
  }
}
2645 
// Input heights 4..9, crossed with output channels 1, 4, 7 and input widths
// 2, 5, 8, 11, 14.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels <= 7; channels += 3) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(ukernel);
      }
    }
  }
}
2665 
// Top padding of 0 and 1, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
// NOTE(review): the channel sweep here is 1, 8, 15 even though the tile is 4;
// other tests for this kernel sweep 1, 4, 7 -- confirm this is intentional.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .padding_top(top)
            .input_channels(3)
            .output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(ukernel);
      }
    }
  }
}
2686 
// Bottom padding of 0 and 1, crossed with output channels 1, 8, 15 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
// NOTE(review): the channel sweep here is 1, 8, 15 even though the tile is 4;
// other tests for this kernel sweep 1, 4, 7 -- confirm this is intentional.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .padding_bottom(bottom)
            .input_channels(3)
            .output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(ukernel);
      }
    }
  }
}
2707 
// output_y_start values 1..3, crossed with output channels 1, 4, 7 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels <= 7; channels += 3) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_start(y_start)
            .Test(ukernel);
      }
    }
  }
}
2728 
// output_y_end values 2..4, crossed with output channels 1, 4, 7 and input
// widths 2, 5, 8, 11, 14, at an input height of 9.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t y_end = 2; y_end <= 4; ++y_end) {
    for (size_t channels = 1; channels <= 7; channels += 3) {
      for (size_t width = 2; width <= 14; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_right(1)
            .input_channels(3)
            .output_channels_tile(4).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_end(y_end)
            .Test(ukernel);
      }
    }
  }
}
2749 
// Runs with qmin set to 128 for output channels 1, 4, 7 and input widths
// 2, 5, 8, 11, 14, at an input height of 6.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t channels = 1; channels <= 7; channels += 3) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(6)
          .qmin(128)
          .Test(ukernel);
    }
  }
}
2768 
// Runs with qmax set to 128 for output channels 1, 4, 7 and input widths
// 2, 5, 8, 11, 14, at an input height of 6.
TEST(F32_CONV_3X3S2P0P1C3X4__NEONFMA_2X1, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__neonfma_2x1;
  for (size_t channels = 1; channels <= 7; channels += 3) {
    for (size_t width = 2; width <= 14; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4).output_channels(channels)
          .input_width(width).input_height(6)
          .qmax(128)
          .Test(ukernel);
    }
  }
}
2787 #endif  // XNN_ARCH_ARM64
2788 
2789 #if XNN_ARCH_ARM64
// Single configuration: input width 2, input height 3 and 8 output channels
// (the same value passed as the output-channel tile), with padding_width(1).
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, input_width_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(8).output_channels(8)
      .input_width(2).input_height(3)
      .Test(ukernel);
}
2803 
// Runs the microkernel for input widths 4, 10 and 16 with 8 output channels
// and an input height of 3.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, input_width_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t width = 4; width <= 16; width += 6) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(ukernel);
  }
}
2819 
// Covers input widths in [3, 3]; the loop visits only width 3. Uses 8 output
// channels and an input height of 3.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, input_width_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t width = 3; width <= 3; ++width) {
    ConvHWCMicrokernelTester()
        .kernel_size(3).subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(8).output_channels(8)
        .input_width(width).input_height(3)
        .Test(ukernel);
  }
}
2835 
// Output channel counts 1..7 (below the tile of 8), each combined with input
// widths 1, 4, 7, 10 and 13 at an input height of 3.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t channels = 1; channels <= 7; ++channels) {
    for (size_t width = 1; width <= 13; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2853 
// Output channel counts 16, 24 and 32 (multiples of the tile of 8), each
// combined with input widths 1, 4, 7, 10 and 13 at an input height of 3.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t channels = 16; channels <= 32; channels += 8) {
    for (size_t width = 1; width <= 13; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2871 
// Output channel counts 9..15 (between one and two tiles of 8), each combined
// with input widths 1, 4, 7, 10 and 13 at an input height of 3.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t channels = 9; channels <= 15; ++channels) {
    for (size_t width = 1; width <= 13; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(3)
          .Test(ukernel);
    }
  }
}
2889 
// Input heights 1 and 2, crossed with output channels 1, 8, 15 and input
// widths 1, 4, 7, 10, 13.
// NOTE(review): padding_height(1) is presumably needed so the padded input is
// at least as tall as the 3-row kernel -- confirm against the tester.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t height = 1; height <= 2; ++height) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 1; width <= 13; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_height(1)
            .padding_width(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(ukernel);
      }
    }
  }
}
2910 
// Input heights 4..9, crossed with output channels 1, 8, 15 and input widths
// 1, 4, 7, 10, 13.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t height = 4; height <= 9; ++height) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 1; width <= 13; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_width(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(height)
            .Test(ukernel);
      }
    }
  }
}
2930 
// Top padding of 0 and 1, crossed with output channels 1, 8, 15 and input
// widths 1, 4, 7, 10, 13, at an input height of 9.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t top = 0; top <= 1; ++top) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 1; width <= 13; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_width(1)
            .padding_top(top)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(ukernel);
      }
    }
  }
}
2951 
// Bottom padding of 0 and 1, crossed with output channels 1, 8, 15 and input
// widths 1, 4, 7, 10, 13, at an input height of 9.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t bottom = 0; bottom <= 1; ++bottom) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 1; width <= 13; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_width(1)
            .padding_bottom(bottom)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .Test(ukernel);
      }
    }
  }
}
2972 
// output_y_start values 1..3, crossed with output channels 1, 8, 15 and input
// widths 1, 4, 7, 10, 13, at an input height of 9.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t y_start = 1; y_start <= 3; ++y_start) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 1; width <= 13; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_width(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_start(y_start)
            .Test(ukernel);
      }
    }
  }
}
2993 
// output_y_end values 2..4, crossed with output channels 1, 8, 15 and input
// widths 1, 4, 7, 10, 13, at an input height of 9.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t y_end = 2; y_end <= 4; ++y_end) {
    for (size_t channels = 1; channels <= 15; channels += 7) {
      for (size_t width = 1; width <= 13; width += 3) {
        ConvHWCMicrokernelTester()
            .kernel_size(3).subsampling(2)
            .padding_width(1)
            .input_channels(3)
            .output_channels_tile(8).output_channels(channels)
            .input_width(width).input_height(9)
            .output_y_end(y_end)
            .Test(ukernel);
      }
    }
  }
}
3014 
// Runs with qmin set to 128 for output channels 1, 8, 15 and input widths
// 1, 4, 7, 10, 13, at an input height of 6.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t channels = 1; channels <= 15; channels += 7) {
    for (size_t width = 1; width <= 13; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(6)
          .qmin(128)
          .Test(ukernel);
    }
  }
}
3033 
// Runs with qmax set to 128 for output channels 1, 8, 15 and input widths
// 1, 4, 7, 10, 13, at an input height of 6.
TEST(F32_CONV_3X3S2P1C3X8__NEONFMA_2X1, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neonfma_2x1;
  for (size_t channels = 1; channels <= 15; channels += 7) {
    for (size_t width = 1; width <= 13; width += 3) {
      ConvHWCMicrokernelTester()
          .kernel_size(3).subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(8).output_channels(channels)
          .input_width(width).input_height(6)
          .qmax(128)
          .Test(ukernel);
    }
  }
}
3052 #endif  // XNN_ARCH_ARM64
3053 
3054 #if XNN_ARCH_ARM64
// Single configuration: input width 2, input height 3 and 4 output channels
// (the same value passed as the output-channel tile), with padding_width(1).
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, input_width_eq_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  const auto ukernel = xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1;
  ConvHWCMicrokernelTester()
      .kernel_size(3).subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4).output_channels(4)
      .input_width(2).input_height(3)
      .Test(ukernel);
}
3068 
// Runs the c3x4 neonfma_2x1 kernel for even input widths {4, 10, 16}
// with 4 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, input_width_div_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t iw = 4; iw <= 16; iw += 6) {
    ConvHWCMicrokernelTester tester{};
    tester
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(iw).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
  }
}
3084 
// Runs the c3x4 neonfma_2x1 kernel for input widths in [3, 4), i.e. only
// width 3, with 4 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, input_width_gt_2) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t iw = 3; iw < 4; iw++) {
    ConvHWCMicrokernelTester tester{};
    tester
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(iw).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
  }
}
3100 
// Runs the c3x4 neonfma_2x1 kernel with fewer output channels than the tile:
// output_channels in {1, 2, 3}, input_width in {1, 4, 7, 10, 13}, height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t oc = 1; oc < 4; oc++) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
    }
  }
}
3118 
// Runs the c3x4 neonfma_2x1 kernel with output channel counts that are
// multiples of the tile: {8, 12, 16}; input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t oc = 8; oc <= 16; oc += 4) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
    }
  }
}
3136 
// Runs the c3x4 neonfma_2x1 kernel with output channel counts just above
// one tile: {5, 6, 7}; input_width in {1, 4, 7, 10, 13}, input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t oc = 5; oc < 8; oc++) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
    }
  }
}
3154 
// Runs the c3x4 neonfma_2x1 kernel for input heights {1, 2},
// output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t ih = 1; ih < 3; ih++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          // Vertical padding is needed here: a padded input height of at
          // least 3 is required.
          .padding_height(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(ih)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
      }
    }
  }
}
3175 
// Runs the c3x4 neonfma_2x1 kernel for input heights 4 through 9,
// output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t ih = 4; ih <= 9; ih++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(ih)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
      }
    }
  }
}
3195 
// Runs the c3x4 neonfma_2x1 kernel with padding_top in {0, 1} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, padding_top) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t pad_top = 0; pad_top <= 1; pad_top++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1).padding_top(pad_top)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
      }
    }
  }
}
3216 
// Runs the c3x4 neonfma_2x1 kernel with padding_bottom in {0, 1} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, padding_bottom) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t pad_bottom = 0; pad_bottom <= 1; pad_bottom++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1).padding_bottom(pad_bottom)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
      }
    }
  }
}
3237 
// Runs the c3x4 neonfma_2x1 kernel with output_y_start in {1, 2, 3} at input
// height 9, output_channels in {1, 4, 7}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, output_y_start) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_start = 1; y_start <= 3; y_start++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
      }
    }
  }
}
3258 
// Runs the c3x4 neonfma_2x1 kernel with output_y_end in {2, 3, 4} at input
// height 9, output_channels in {1, 4, 7}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, output_y_end) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t y_end = 2; y_end < 5; y_end++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
      }
    }
  }
}
3279 
// Runs the c3x4 neonfma_2x1 kernel with qmin(128) at input height 6,
// for output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, qmin) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t oc = 1; oc < 8; oc += 3) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
    }
  }
}
3298 
// Runs the c3x4 neonfma_2x1 kernel with qmax(128) at input height 6,
// for output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEONFMA_2X1, qmax) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t oc = 1; oc < 8; oc += 3) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neonfma_2x1);
    }
  }
}
3317 #endif  // XNN_ARCH_ARM64
3318 
3319 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration for the c3x8 neon_2x1 kernel: input width 2,
// input height 3, and 8 output channels (equal to the output-channel tile).
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, input_width_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  ConvHWCMicrokernelTester tester{};
  tester
    .kernel_size(3).subsampling(2).padding_width(1)
    .input_channels(3).output_channels_tile(8).output_channels(8)
    .input_width(2).input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
}
3333 
// Runs the c3x8 neon_2x1 kernel for even input widths {4, 10, 16}
// with 8 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, input_width_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t iw = 4; iw <= 16; iw += 6) {
    ConvHWCMicrokernelTester tester{};
    tester
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(iw).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
  }
}
3349 
// Runs the c3x8 neon_2x1 kernel for input widths in [3, 4), i.e. only
// width 3, with 8 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, input_width_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t iw = 3; iw < 4; iw++) {
    ConvHWCMicrokernelTester tester{};
    tester
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(8).output_channels(8)
      .input_width(iw).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
  }
}
3365 
// Runs the c3x8 neon_2x1 kernel with fewer output channels than the tile:
// output_channels in 1..7, input_width in {1, 4, 7, 10, 13}, input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, output_channels_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 1; oc < 8; oc++) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
    }
  }
}
3383 
// Runs the c3x8 neon_2x1 kernel with output channel counts that are
// multiples of the tile: {16, 24, 32}; input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, output_channels_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 16; oc <= 32; oc += 8) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
    }
  }
}
3401 
// Runs the c3x8 neon_2x1 kernel with output channel counts just above
// one tile: 9..15; input_width in {1, 4, 7, 10, 13}, input height 3.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, output_channels_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 9; oc < 16; oc++) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
    }
  }
}
3419 
// Runs the c3x8 neon_2x1 kernel for input heights {1, 2} with
// padding_height(1), output_channels in {1, 8, 15} and
// input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t ih = 1; ih < 3; ih++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2)
          .padding_height(1).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(oc)
          .input_width(iw).input_height(ih)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
      }
    }
  }
}
3440 
// Runs the c3x8 neon_2x1 kernel for input heights 4 through 9,
// output_channels in {1, 8, 15} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t ih = 4; ih <= 9; ih++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(oc)
          .input_width(iw).input_height(ih)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
      }
    }
  }
}
3460 
// Runs the c3x8 neon_2x1 kernel with padding_top in {0, 1} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, padding_top) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t pad_top = 0; pad_top <= 1; pad_top++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1).padding_top(pad_top)
          .input_channels(3).output_channels_tile(8).output_channels(oc)
          .input_width(iw).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
      }
    }
  }
}
3481 
// Runs the c3x8 neon_2x1 kernel with padding_bottom in {0, 1} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, padding_bottom) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t pad_bottom = 0; pad_bottom <= 1; pad_bottom++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1).padding_bottom(pad_bottom)
          .input_channels(3).output_channels_tile(8).output_channels(oc)
          .input_width(iw).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
      }
    }
  }
}
3502 
// Runs the c3x8 neon_2x1 kernel with output_y_start in {1, 2, 3} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, output_y_start) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_start = 1; y_start <= 3; y_start++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(oc)
          .input_width(iw).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
      }
    }
  }
}
3523 
// Runs the c3x8 neon_2x1 kernel with output_y_end in {2, 3, 4} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, output_y_end) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_end = 2; y_end < 5; y_end++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(8).output_channels(oc)
          .input_width(iw).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
      }
    }
  }
}
3544 
// Runs the c3x8 neon_2x1 kernel with qmin(128) at input height 6,
// for output_channels in {1, 8, 15} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, qmin) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 1; oc < 16; oc += 7) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(oc)
        .input_width(iw).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
    }
  }
}
3563 
// Runs the c3x8 neon_2x1 kernel with qmax(128) at input height 6,
// for output_channels in {1, 8, 15} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X8__NEON_2X1, qmax) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 1; oc < 16; oc += 7) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(8).output_channels(oc)
        .input_width(iw).input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x8__neon_2x1);
    }
  }
}
3582 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3583 
3584 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Single configuration for the c3x4 neon_2x1 kernel: input width 2,
// input height 3, and 4 output channels (equal to the output-channel tile).
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, input_width_eq_2) {
  TEST_REQUIRES_ARM_NEON;
  ConvHWCMicrokernelTester tester{};
  tester
    .kernel_size(3).subsampling(2).padding_width(1)
    .input_channels(3).output_channels_tile(4).output_channels(4)
    .input_width(2).input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
}
3598 
// Runs the c3x4 neon_2x1 kernel for even input widths {4, 10, 16}
// with 4 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, input_width_div_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t iw = 4; iw <= 16; iw += 6) {
    ConvHWCMicrokernelTester tester{};
    tester
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(iw).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
  }
}
3614 
// Runs the c3x4 neon_2x1 kernel for input widths in [3, 4), i.e. only
// width 3, with 4 output channels and input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, input_width_gt_2) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t iw = 3; iw < 4; iw++) {
    ConvHWCMicrokernelTester tester{};
    tester
      .kernel_size(3).subsampling(2).padding_width(1)
      .input_channels(3).output_channels_tile(4).output_channels(4)
      .input_width(iw).input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
  }
}
3630 
// Runs the c3x4 neon_2x1 kernel with fewer output channels than the tile:
// output_channels in {1, 2, 3}, input_width in {1, 4, 7, 10, 13}, height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, output_channels_lt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 1; oc < 4; oc++) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
    }
  }
}
3648 
// Runs the c3x4 neon_2x1 kernel with output channel counts that are
// multiples of the tile: {8, 12, 16}; input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, output_channels_div_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 8; oc <= 16; oc += 4) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
    }
  }
}
3666 
// Runs the c3x4 neon_2x1 kernel with output channel counts just above
// one tile: {5, 6, 7}; input_width in {1, 4, 7, 10, 13}, input height 3.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, output_channels_gt_4) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 5; oc < 8; oc++) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
    }
  }
}
3684 
// Runs the c3x4 neon_2x1 kernel for input heights {1, 2},
// output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, input_height_lt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t ih = 1; ih < 3; ih++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          // Vertical padding is needed here: a padded input height of at
          // least 3 is required.
          .padding_height(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(ih)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
      }
    }
  }
}
3705 
// Runs the c3x4 neon_2x1 kernel for input heights 4 through 9,
// output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, input_height_gt_3) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t ih = 4; ih <= 9; ih++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(ih)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
      }
    }
  }
}
3725 
// Runs the c3x4 neon_2x1 kernel with padding_top in {0, 1} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, padding_top) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t pad_top = 0; pad_top <= 1; pad_top++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1).padding_top(pad_top)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
      }
    }
  }
}
3746 
// Runs the c3x4 neon_2x1 kernel with padding_bottom in {0, 1} at input
// height 9, output_channels in {1, 8, 15}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, padding_bottom) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t pad_bottom = 0; pad_bottom <= 1; pad_bottom++) {
    for (size_t oc = 1; oc < 16; oc += 7) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1).padding_bottom(pad_bottom)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
      }
    }
  }
}
3767 
// Runs the c3x4 neon_2x1 kernel with output_y_start in {1, 2, 3} at input
// height 9, output_channels in {1, 4, 7}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, output_y_start) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_start = 1; y_start <= 3; y_start++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .output_y_start(y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
      }
    }
  }
}
3788 
// Runs the c3x4 neon_2x1 kernel with output_y_end in {2, 3, 4} at input
// height 9, output_channels in {1, 4, 7}, input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, output_y_end) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t y_end = 2; y_end < 5; y_end++) {
    for (size_t oc = 1; oc < 8; oc += 3) {
      for (size_t iw = 1; iw < 16; iw += 3) {
        ConvHWCMicrokernelTester tester{};
        tester
          .kernel_size(3).subsampling(2).padding_width(1)
          .input_channels(3).output_channels_tile(4).output_channels(oc)
          .input_width(iw).input_height(9)
          .output_y_end(y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
      }
    }
  }
}
3809 
// Runs the c3x4 neon_2x1 kernel with qmin(128) at input height 6,
// for output_channels in {1, 4, 7} and input_width in {1, 4, 7, 10, 13}.
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, qmin) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t oc = 1; oc < 8; oc += 3) {
    for (size_t iw = 1; iw < 16; iw += 3) {
      ConvHWCMicrokernelTester tester{};
      tester
        .kernel_size(3).subsampling(2).padding_width(1)
        .input_channels(3).output_channels_tile(4).output_channels(oc)
        .input_width(iw).input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
    }
  }
}
3828 
TEST(F32_CONV_3X3S2P1C3X4__NEON_2X1, qmax) {
  TEST_REQUIRES_ARM_NEON;
  // Runs with qmax set to 128 for output_channels in {1, 4, 7} and
  // input_width in {1, 4, 7, 10, 13}; the input height stays fixed at 6.
  // NOTE(review): qmax presumably selects the upper output clamp; its exact
  // meaning is defined by ConvHWCMicrokernelTester - confirm there.
  for (size_t channels = 1; channels < 8; channels += 3) {
    for (size_t width = 1; width < 16; width += 3) {
      ConvHWCMicrokernelTester tester{};
      // 3x3 kernel, stride 2, horizontal padding of 1.
      tester.kernel_size(3).subsampling(2).padding_width(1);
      tester.input_channels(3).output_channels_tile(4).output_channels(channels);
      tester.input_width(width).input_height(6).qmax(128);
      tester.Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__neon_2x1);
    }
  }
}
3847 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3848 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, input_width_eq_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Single configuration: input_width 2, input_height 3, and 4 output
  // channels (equal to output_channels_tile).
  ConvHWCMicrokernelTester()
    .kernel_size(3)
    .subsampling(2)
    .padding_width(1)
    .input_channels(3)
    .output_channels_tile(4)
    .output_channels(4)
    .input_width(2)
    .input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
}
3862 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, input_width_div_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Even input widths 4, 10 and 16 with input_height 3 and 4 output channels.
  for (size_t input_width = 4; input_width <= 16; input_width += 6) {
    ConvHWCMicrokernelTester()
      .kernel_size(3)
      .subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4)
      .output_channels(4)
      .input_width(input_width)
      .input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
  }
}
3878 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, input_width_lt_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // The loop bounds admit only input_width 1 (the sole width below 2).
  for (size_t input_width = 1; input_width < 2; input_width++) {
    ConvHWCMicrokernelTester()
      .kernel_size(3)
      .subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4)
      .output_channels(4)
      .input_width(input_width)
      .input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
  }
}
3894 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, input_width_gt_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // The loop bounds admit only input_width 3 (strictly between 2 and 4).
  for (size_t input_width = 3; input_width < 4; input_width++) {
    ConvHWCMicrokernelTester()
      .kernel_size(3)
      .subsampling(2)
      .padding_width(1)
      .input_channels(3)
      .output_channels_tile(4)
      .output_channels(4)
      .input_width(input_width)
      .input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
  }
}
3910 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, output_channels_lt_4) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Output channel counts 1..3 (below output_channels_tile) crossed with
  // input widths {1, 4, 7, 10, 13}; input_height is 3.
  for (size_t output_channels = 1; output_channels < 4; output_channels++) {
    for (size_t input_width = 1; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
    }
  }
}
3928 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, output_channels_div_4) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Output channel counts 8, 12 and 16 (multiples of output_channels_tile)
  // crossed with input widths {1, 4, 7, 10, 13}; input_height is 3.
  for (size_t output_channels = 8; output_channels <= 16; output_channels += 4) {
    for (size_t input_width = 1; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
    }
  }
}
3946 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, output_channels_gt_4) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Output channel counts 5..7 (one full tile plus a remainder) crossed with
  // input widths {1, 4, 7, 10, 13}; input_height is 3.
  for (size_t output_channels = 5; output_channels < 8; output_channels++) {
    for (size_t input_width = 1; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
    }
  }
}
3964 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, input_height_lt_3) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Input heights 1 and 2 crossed with output_channels in {1, 4, 7} and
  // input widths {1, 4, 7, 10, 13}.
  for (size_t input_height = 1; input_height < 3; input_height++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 1; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          // Uses padding() rather than padding_width() as in the other tests
          // of this kernel; presumably this also pads vertically - confirm
          // against ConvHWCMicrokernelTester.
          .padding(1)  // padded input height of at least 3 required
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(input_height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
      }
    }
  }
}
3984 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, input_height_gt_3) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Input heights 4..9 crossed with output_channels in {1, 4, 7} and
  // input widths {1, 4, 7, 10, 13}.
  for (size_t input_height = 4; input_height <= 9; input_height++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 1; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(input_height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
      }
    }
  }
}
4004 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, padding_top) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Top padding 0 and 1 crossed with output_channels in {1, 8, 15} and
  // input widths {1, 4, 7, 10, 13}; input_height is 9.
  for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
    for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
      for (size_t input_width = 1; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_width(1)
          .padding_top(padding_top)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
      }
    }
  }
}
4025 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, padding_bottom) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Bottom padding 0 and 1 crossed with output_channels in {1, 8, 15} and
  // input widths {1, 4, 7, 10, 13}; input_height is 9.
  for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) {
    for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
      for (size_t input_width = 1; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_width(1)
          .padding_bottom(padding_bottom)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
      }
    }
  }
}
4046 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, output_y_start) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // output_y_start 1..3 crossed with output_channels in {1, 4, 7} and
  // input widths {1, 4, 7, 10, 13}; input_height is 9.
  for (size_t output_y_start = 1; output_y_start <= 3; output_y_start++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 1; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .output_y_start(output_y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
      }
    }
  }
}
4067 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, output_y_end) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // output_y_end 2..4 crossed with output_channels in {1, 4, 7} and
  // input widths {1, 4, 7, 10, 13}; input_height is 9.
  for (size_t output_y_end = 2; output_y_end < 5; output_y_end++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 1; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_width(1)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .output_y_end(output_y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
      }
    }
  }
}
4088 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, qmin) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // qmin fixed at 128, crossed with output_channels in {1, 4, 7} and
  // input widths {1, 4, 7, 10, 13}; input_height is 6.
  // NOTE(review): qmin presumably selects the lower output clamp - confirm
  // against ConvHWCMicrokernelTester.
  for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
    for (size_t input_width = 1; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
    }
  }
}
4107 
TEST(F32_CONV_3X3S2P1C3X4__SCALAR_1X1, qmax) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // qmax fixed at 128, crossed with output_channels in {1, 4, 7} and
  // input widths {1, 4, 7, 10, 13}; input_height is 6.
  // NOTE(review): qmax presumably selects the upper output clamp - confirm
  // against ConvHWCMicrokernelTester.
  for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
    for (size_t input_width = 1; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_width(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1);
    }
  }
}
4126 
4127 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, input_width_eq_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Single configuration: right padding 1 only, input_width 2,
  // input_height 3, and 4 output channels (equal to output_channels_tile).
  ConvHWCMicrokernelTester()
    .kernel_size(3)
    .subsampling(2)
    .padding_right(1)
    .input_channels(3)
    .output_channels_tile(4)
    .output_channels(4)
    .input_width(2)
    .input_height(3)
    .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
}
4141 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, input_width_div_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Even input widths 4, 10 and 16 with right padding 1, input_height 3 and
  // 4 output channels.
  for (size_t input_width = 4; input_width <= 16; input_width += 6) {
    ConvHWCMicrokernelTester()
      .kernel_size(3)
      .subsampling(2)
      .padding_right(1)
      .input_channels(3)
      .output_channels_tile(4)
      .output_channels(4)
      .input_width(input_width)
      .input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
  }
}
4157 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, input_width_gt_2) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // The loop bounds admit only input_width 3 (strictly between 2 and 4).
  for (size_t input_width = 3; input_width < 4; input_width++) {
    ConvHWCMicrokernelTester()
      .kernel_size(3)
      .subsampling(2)
      .padding_right(1)
      .input_channels(3)
      .output_channels_tile(4)
      .output_channels(4)
      .input_width(input_width)
      .input_height(3)
      .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
  }
}
4173 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, output_channels_lt_4) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Output channel counts 1..3 (below output_channels_tile) crossed with
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 3.
  for (size_t output_channels = 1; output_channels < 4; output_channels++) {
    for (size_t input_width = 2; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
    }
  }
}
4191 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, output_channels_div_4) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Output channel counts 8, 12 and 16 (multiples of output_channels_tile)
  // crossed with input widths {2, 5, 8, 11, 14}; right padding 1,
  // input_height 3.
  for (size_t output_channels = 8; output_channels <= 16; output_channels += 4) {
    for (size_t input_width = 2; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
    }
  }
}
4209 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, output_channels_gt_4) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Output channel counts 5..7 (one full tile plus a remainder) crossed with
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 3.
  for (size_t output_channels = 5; output_channels < 8; output_channels++) {
    for (size_t input_width = 2; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(3)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
    }
  }
}
4227 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, input_height_lt_3) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Input heights 1 and 2 crossed with output_channels in {1, 4, 7} and
  // input widths {2, 5, 8, 11, 14}; right padding 1.
  for (size_t input_height = 1; input_height < 3; input_height++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 2; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_right(1)
          .padding_height(1)  // padded input height of at least 3 required
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(input_height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
      }
    }
  }
}
4248 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, input_height_gt_3) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Input heights 4..9 crossed with output_channels in {1, 4, 7} and
  // input widths {2, 5, 8, 11, 14}; right padding 1.
  for (size_t input_height = 4; input_height <= 9; input_height++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 2; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(input_height)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
      }
    }
  }
}
4268 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, padding_top) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Top padding 0 and 1 crossed with output_channels in {1, 8, 15} and
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 9.
  for (size_t padding_top = 0; padding_top <= 1; padding_top++) {
    for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
      for (size_t input_width = 2; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_right(1)
          .padding_top(padding_top)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
      }
    }
  }
}
4289 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, padding_bottom) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // Bottom padding 0 and 1 crossed with output_channels in {1, 8, 15} and
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 9.
  for (size_t padding_bottom = 0; padding_bottom <= 1; padding_bottom++) {
    for (size_t output_channels = 1; output_channels < 16; output_channels += 7) {
      for (size_t input_width = 2; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_right(1)
          .padding_bottom(padding_bottom)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
      }
    }
  }
}
4310 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, output_y_start) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // output_y_start 1..3 crossed with output_channels in {1, 4, 7} and
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 9.
  for (size_t output_y_start = 1; output_y_start <= 3; output_y_start++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 2; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .output_y_start(output_y_start)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
      }
    }
  }
}
4331 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, output_y_end) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // output_y_end 2..4 crossed with output_channels in {1, 4, 7} and
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 9.
  for (size_t output_y_end = 2; output_y_end < 5; output_y_end++) {
    for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
      for (size_t input_width = 2; input_width < 16; input_width += 3) {
        ConvHWCMicrokernelTester()
          .kernel_size(3)
          .subsampling(2)
          .padding_right(1)
          .input_channels(3)
          .output_channels_tile(4)
          .output_channels(output_channels)
          .input_width(input_width)
          .input_height(9)
          .output_y_end(output_y_end)
          .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
      }
    }
  }
}
4352 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, qmin) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // qmin fixed at 128, crossed with output_channels in {1, 4, 7} and
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 6.
  // NOTE(review): qmin presumably selects the lower output clamp - confirm
  // against ConvHWCMicrokernelTester.
  for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
    for (size_t input_width = 2; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(6)
        .qmin(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
    }
  }
}
4371 
TEST(F32_CONV_3X3S2P0P1C3X4__SCALAR_1X1, qmax) {
  // Scalar kernel test located outside the XNN_ARCH_ARM || XNN_ARCH_ARM64
  // guard: it must not be gated on NEON FMA, so no TEST_REQUIRES_* check.
  // qmax fixed at 128, crossed with output_channels in {1, 4, 7} and
  // input widths {2, 5, 8, 11, 14}; right padding 1, input_height 6.
  // NOTE(review): qmax presumably selects the upper output clamp - confirm
  // against ConvHWCMicrokernelTester.
  for (size_t output_channels = 1; output_channels < 8; output_channels += 3) {
    for (size_t input_width = 2; input_width < 16; input_width += 3) {
      ConvHWCMicrokernelTester()
        .kernel_size(3)
        .subsampling(2)
        .padding_right(1)
        .input_channels(3)
        .output_channels_tile(4)
        .output_channels(output_channels)
        .input_width(input_width)
        .input_height(6)
        .qmax(128)
        .Test(xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1);
    }
  }
}
4390