1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 //
6 // Auto-generated file. Do not edit!
7 // Specification: test/x64-transpose.yaml
8 // Generator: tools/generate-transpose-test.py
9
10
11 #include <gtest/gtest.h>
12
13 #include <xnnpack/common.h>
14 #include <xnnpack/isa-checks.h>
15
16 #include <xnnpack/transpose.h>
17 #include "transpose-microkernel-tester.h"
18
19
// One run at block height 1 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_bw_2) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
}
29
// Sweeps every block height in [1, 2] against every block width in [1, 4],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_2_bw_1_4) {
  for (size_t bh = 1; bh <= 2; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
    }
  }
}
43
// One run at block height 1 and block width 4, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_bw_4) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
}
53
// Block height 1 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(1)
        .block_width(bw)
        .block_height(1)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
  }
}
65
// Block height 2 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_2_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(2)
        .block_width(bw)
        .block_height(2)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
  }
}
77
// One run at block height 2 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_2_bw_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
}
87
// Block width 2 swept across block heights [2, 2]; the output stride follows the height.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_2_2_bw_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bh = 2; bh <= 2; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
  }
}
99
// Block width 4 swept across block heights [2, 2]; the output stride follows the height.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_2_2_bw_4) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bh = 2; bh <= 2; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(4)
        .output_stride(bh)
        .block_width(4)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
  }
}
111
// Sweeps block heights [2, 2] against block widths [3, 4], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_2_2_bw_3_4) {
  // Both bounds are inclusive: the height range [2, 2] holds a single value,
  // so an exclusive bound would leave the whole test body unexecuted.
  for (size_t bh = 2; bh <= 2; ++bh) {
    for (size_t bw = 3; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
    }
  }
}
125
// Block height 1, block width 2, with an input stride (4) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_bw_2_is_4) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
}
135
// Block height 1, block width 2, with an output stride (2) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_bw_2_os_2) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  const size_t kOutputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
}
145
// Block height 1, block width 2, with input stride 4 and output stride 2 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__1X2_SCALAR_INT, bh_1_bw_2_is_4_os_2) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  const size_t kOutputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_int);
}
155
// One run at block height 2 and block width 1, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_2_bw_1) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
}
165
// Sweeps every block height in [1, 4] against every block width in [1, 2],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_1_4_bw_1_2) {
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
    }
  }
}
179
// One run at block height 2 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_2_bw_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
}
189
// Block height 2 swept across block widths [2, 2]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_2_bw_2_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(2)
        .block_width(bw)
        .block_height(2)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
  }
}
201
// Block height 4 swept across block widths [2, 2]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_4_bw_2_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(4)
        .block_width(bw)
        .block_height(4)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
  }
}
213
// One run at block height 4 and block width 1, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_4_bw_1) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 1;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
}
223
// Block width 1 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_3_4_bw_1) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(1)
        .output_stride(bh)
        .block_width(1)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
  }
}
235
// Block width 2 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_3_4_bw_2) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
  }
}
247
// Sweeps block heights [3, 4] against block widths [2, 2], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_3_4_bw_2_2) {
  // Both bounds are inclusive: the width range [2, 2] holds a single value,
  // so an exclusive bound would leave the kernel call unreachable.
  for (size_t bh = 3; bh <= 4; ++bh) {
    for (size_t bw = 2; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
    }
  }
}
261
// Block height 2, block width 1, with an input stride (2) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_2_bw_1_is_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  const size_t kInputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
}
271
// Block height 2, block width 1, with an output stride (4) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_2_bw_1_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
}
281
// Block height 2, block width 1, with input stride 2 and output stride 4 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__2X1_SCALAR_INT, bh_2_bw_1_is_2_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  const size_t kInputStride = 2;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_int);
}
291
// One run at block height 2 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_2_bw_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
}
301
// Sweeps every block height in [1, 4] against every block width in [1, 4],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_1_4_bw_1_4) {
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
    }
  }
}
315
// One run at block height 2 and block width 4, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_2_bw_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
}
325
// Block height 2 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_2_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(2)
        .block_width(bw)
        .block_height(2)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
  }
}
337
// Block height 4 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_4_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(4)
        .block_width(bw)
        .block_height(4)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
  }
}
349
// One run at block height 4 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_4_bw_2) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
}
359
// Block width 2 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_3_4_bw_2) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
  }
}
371
// Block width 4 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_3_4_bw_4) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(4)
        .output_stride(bh)
        .block_width(4)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
  }
}
383
// Sweeps block heights [3, 4] against block widths [3, 4], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_3_4_bw_3_4) {
  // Inclusive bounds so the last height and width in the test name (4) are
  // exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    for (size_t bw = 3; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
    }
  }
}
397
// Block height 2, block width 2, with an input stride (4) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_2_bw_2_is_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
}
407
// Block height 2, block width 2, with an output stride (4) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_2_bw_2_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
}
417
// Block height 2, block width 2, with input stride 4 and output stride 4 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__2X2_SCALAR_INT, bh_2_bw_2_is_4_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_int);
}
427
// One run at block height 4 and block width 1, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_4_bw_1) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 1;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
}
437
// Sweeps every block height in [1, 8] against every block width in [1, 2],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_1_8_bw_1_2) {
  for (size_t bh = 1; bh <= 8; ++bh) {
    for (size_t bw = 1; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
    }
  }
}
451
// One run at block height 4 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_4_bw_2) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
}
461
// Block height 4 swept across block widths [2, 2]; the input stride follows the width.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_4_bw_2_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(4)
        .block_width(bw)
        .block_height(4)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
  }
}
473
// Block height 8 swept across block widths [2, 2]; the input stride follows the width.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_8_bw_2_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(8)
        .block_width(bw)
        .block_height(8)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
  }
}
485
// One run at block height 8 and block width 1, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_8_bw_1) {
  const size_t kBlockHeight = 8;
  const size_t kBlockWidth = 1;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
}
495
// Block width 1 swept across block heights [5, 8]; the output stride follows the height.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_5_8_bw_1) {
  // Inclusive bound so the last height in the test name (8) is exercised too.
  for (size_t bh = 5; bh <= 8; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(1)
        .output_stride(bh)
        .block_width(1)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
  }
}
507
// Block width 2 swept across block heights [5, 8]; the output stride follows the height.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_5_8_bw_2) {
  // Inclusive bound so the last height in the test name (8) is exercised too.
  for (size_t bh = 5; bh <= 8; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
  }
}
519
// Sweeps block heights [5, 8] against block widths [2, 2], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_5_8_bw_2_2) {
  // Both bounds are inclusive: the width range [2, 2] holds a single value,
  // so an exclusive bound would leave the kernel call unreachable.
  for (size_t bh = 5; bh <= 8; ++bh) {
    for (size_t bw = 2; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
    }
  }
}
533
// Block height 4, block width 1, with an input stride (2) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_4_bw_1_is_2) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 1;
  const size_t kInputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
}
543
// Block height 4, block width 1, with an output stride (8) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_4_bw_1_os_8) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 1;
  const size_t kOutputStride = 8;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
}
553
// Block height 4, block width 1, with input stride 2 and output stride 8 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__4X1_SCALAR_INT, bh_4_bw_1_is_2_os_8) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 1;
  const size_t kInputStride = 2;
  const size_t kOutputStride = 8;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_int);
}
563
// One run at block height 4 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_4_bw_2) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
}
573
// Sweeps every block height in [1, 8] against every block width in [1, 4],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_1_8_bw_1_4) {
  for (size_t bh = 1; bh <= 8; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
    }
  }
}
587
// One run at block height 4 and block width 4, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_4_bw_4) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
}
597
// Block height 4 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_4_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(4)
        .block_width(bw)
        .block_height(4)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
  }
}
609
// Block height 8 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_8_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(8)
        .block_width(bw)
        .block_height(8)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
  }
}
621
// One run at block height 8 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_8_bw_2) {
  const size_t kBlockHeight = 8;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
}
631
// Block width 2 swept across block heights [5, 8]; the output stride follows the height.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_5_8_bw_2) {
  // Inclusive bound so the last height in the test name (8) is exercised too.
  for (size_t bh = 5; bh <= 8; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
  }
}
643
// Block width 4 swept across block heights [5, 8]; the output stride follows the height.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_5_8_bw_4) {
  // Inclusive bound so the last height in the test name (8) is exercised too.
  for (size_t bh = 5; bh <= 8; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(4)
        .output_stride(bh)
        .block_width(4)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
  }
}
655
// Sweeps block heights [5, 8] against block widths [3, 4], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_5_8_bw_3_4) {
  // Inclusive bounds so the last height (8) and width (4) in the test name are
  // exercised too.
  for (size_t bh = 5; bh <= 8; ++bh) {
    for (size_t bw = 3; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
    }
  }
}
669
// Block height 4, block width 2, with an input stride (4) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_4_bw_2_is_4) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
}
679
// Block height 4, block width 2, with an output stride (8) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_4_bw_2_os_8) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  const size_t kOutputStride = 8;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
}
689
// Block height 4, block width 2, with input stride 4 and output stride 8 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__4X2_SCALAR_INT, bh_4_bw_2_is_4_os_8) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  const size_t kOutputStride = 8;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_int);
}
699
// One run at block height 1 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_bw_2) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
}
709
// Sweeps every block height in [1, 2] against every block width in [1, 4],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_2_bw_1_4) {
  for (size_t bh = 1; bh <= 2; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
    }
  }
}
723
// One run at block height 1 and block width 4, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_bw_4) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
}
733
// Block height 1 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(1)
        .block_width(bw)
        .block_height(1)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
  }
}
745
// Block height 2 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_2_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(2)
        .block_width(bw)
        .block_height(2)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
  }
}
757
// One run at block height 2 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_2_bw_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
}
767
// Block width 2 swept across block heights [2, 2]; the output stride follows the height.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_2_2_bw_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bh = 2; bh <= 2; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
  }
}
779
// Block width 4 swept across block heights [2, 2]; the output stride follows the height.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_2_2_bw_4) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bh = 2; bh <= 2; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(4)
        .output_stride(bh)
        .block_width(4)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
  }
}
791
// Sweeps block heights [2, 2] against block widths [3, 4], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_2_2_bw_3_4) {
  // Both bounds are inclusive: the height range [2, 2] holds a single value,
  // so an exclusive bound would leave the whole test body unexecuted.
  for (size_t bh = 2; bh <= 2; ++bh) {
    for (size_t bw = 3; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
    }
  }
}
805
// Block height 1, block width 2, with an input stride (4) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_bw_2_is_4) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
}
815
// Block height 1, block width 2, with an output stride (2) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_bw_2_os_2) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  const size_t kOutputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
}
825
// Block height 1, block width 2, with input stride 4 and output stride 2 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__1X2_SCALAR_FLOAT, bh_1_bw_2_is_4_os_2) {
  const size_t kBlockHeight = 1;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  const size_t kOutputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__1x2_scalar_float);
}
835
// One run at block height 2 and block width 1, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_2_bw_1) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
}
845
// Sweeps every block height in [1, 4] against every block width in [1, 2],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_1_4_bw_1_2) {
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
    }
  }
}
859
// One run at block height 2 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_2_bw_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
}
869
// Block height 2 swept across block widths [2, 2]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_2_bw_2_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(2)
        .block_width(bw)
        .block_height(2)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
  }
}
881
// Block height 4 swept across block widths [2, 2]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_4_bw_2_2) {
  // Inclusive bound: [2, 2] holds a single value, so an exclusive bound would
  // skip the body entirely and the test would never invoke the kernel.
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(4)
        .block_width(bw)
        .block_height(4)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
  }
}
893
// One run at block height 4 and block width 1, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_4_bw_1) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 1;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
}
903
// Block width 1 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_3_4_bw_1) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(1)
        .output_stride(bh)
        .block_width(1)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
  }
}
915
// Block width 2 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_3_4_bw_2) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
  }
}
927
// Sweeps block heights [3, 4] against block widths [2, 2], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_3_4_bw_2_2) {
  // Both bounds are inclusive: the width range [2, 2] holds a single value,
  // so an exclusive bound would leave the kernel call unreachable.
  for (size_t bh = 3; bh <= 4; ++bh) {
    for (size_t bw = 2; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
    }
  }
}
941
// Block height 2, block width 1, with an input stride (2) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_2_bw_1_is_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  const size_t kInputStride = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
}
951
// Block height 2, block width 1, with an output stride (4) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_2_bw_1_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
}
961
// Block height 2, block width 1, with input stride 2 and output stride 4 both
// larger than the corresponding block dimension.
TEST(X64_TRANSPOSE__2X1_SCALAR_FLOAT, bh_2_bw_1_is_2_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 1;
  const size_t kInputStride = 2;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x1_scalar_float);
}
971
// One run at block height 2 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_2_bw_2) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
}
981
// Sweeps every block height in [1, 4] against every block width in [1, 4],
// using the block dimensions as strides.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_1_4_bw_1_4) {
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
    }
  }
}
995
// One run at block height 2 and block width 4, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_2_bw_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
}
1005
// Block height 2 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_2_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(2)
        .block_width(bw)
        .block_height(2)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
  }
}
1017
// Block height 4 swept across block widths [3, 4]; the input stride follows the width.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_4_bw_3_4) {
  // Inclusive bound so the last width in the test name (4) is exercised too.
  for (size_t bw = 3; bw <= 4; ++bw) {
    TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(4)
        .block_width(bw)
        .block_height(4)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
  }
}
1029
// One run at block height 4 and block width 2, with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_4_bw_2) {
  const size_t kBlockHeight = 4;
  const size_t kBlockWidth = 2;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
}
1039
// Block width 2 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_3_4_bw_2) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(2)
        .output_stride(bh)
        .block_width(2)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
  }
}
1051
// Block width 4 swept across block heights [3, 4]; the output stride follows the height.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_3_4_bw_4) {
  // Inclusive bound so the last height in the test name (4) is exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    TransposeMicrokernelTester()
        .input_stride(4)
        .output_stride(bh)
        .block_width(4)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
  }
}
1063
// Sweeps block heights [3, 4] against block widths [3, 4], using the block
// dimensions as strides.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_3_4_bw_3_4) {
  // Inclusive bounds so the last height and width in the test name (4) are
  // exercised too.
  for (size_t bh = 3; bh <= 4; ++bh) {
    for (size_t bw = 3; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
          .input_stride(bw)
          .output_stride(bh)
          .block_width(bw)
          .block_height(bh)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
    }
  }
}
1077
// Block height 2, block width 2, with an input stride (4) larger than the block
// width; the output stride equals the block height.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_2_bw_2_is_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  const size_t kInputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kBlockHeight)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
}
1087
// Block height 2, block width 2, with an output stride (4) larger than the block
// height; the input stride equals the block width.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_2_bw_2_os_4) {
  const size_t kBlockHeight = 2;
  const size_t kBlockWidth = 2;
  const size_t kOutputStride = 4;
  TransposeMicrokernelTester()
      .input_stride(kBlockWidth)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
}
1097
// One case: 2x2 block with both input_stride and output_stride set to 4.
TEST(X64_TRANSPOSE__2X2_SCALAR_FLOAT, bh_2_bw_2_is_4_os_4) {
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_scalar_float);
}
1107
// One case: block_width=1, block_height=4, input_stride=1, output_stride=4.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_4_bw_1) {
  TransposeMicrokernelTester()
    .input_stride(1)
    .output_stride(4)
    .block_width(1)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
}
1117
// block_height in [1, 8] x block_width in [1, 2]; strides match the block dimensions.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_1_8_bw_1_2) {
  for (size_t bh = 1; bh <= 8; ++bh) {
    for (size_t bw = 1; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
    }
  }
}
1131
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_4_bw_2) {
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
}
1141
// Exercises block_width=2 with block_height=4 and output_stride=4; input_stride follows the width.
//
// The generated loop bound was `< 2`, i.e. the empty range [2, 2), so the body
// never ran and the test passed without ever invoking the kernel. The bound is
// inclusive here so the kernel is actually called.
// NOTE(review): this file is auto-generated; the empty range presumably comes
// from the generator template and should be fixed there as well - confirm.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_4_bw_2_2) {
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
  }
}
1153
// Exercises block_width=2 with block_height=8 and output_stride=8; input_stride follows the width.
//
// The generated loop bound was `< 2`, i.e. the empty range [2, 2), so the body
// never ran and the test passed without ever invoking the kernel. The bound is
// inclusive here so the kernel is actually called.
// NOTE(review): this file is auto-generated; the empty range presumably comes
// from the generator template and should be fixed there as well - confirm.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_8_bw_2_2) {
  for (size_t bw = 2; bw <= 2; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(8)
      .block_width(bw)
      .block_height(8)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
  }
}
1165
// One case: block_width=1, block_height=8, input_stride=1, output_stride=8.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_8_bw_1) {
  TransposeMicrokernelTester()
    .input_stride(1)
    .output_stride(8)
    .block_width(1)
    .block_height(8)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
}
1175
// block_height in [5, 8) with block_width=1 and input_stride=1; output_stride follows the height.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_5_8_bw_1) {
  for (size_t bh = 5; bh < 8; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(1)
      .output_stride(bh)
      .block_width(1)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
  }
}
1187
// block_height in [5, 8) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_5_8_bw_2) {
  for (size_t bh = 5; bh < 8; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
  }
}
1199
// block_height in [5, 8) with block_width=2; strides match the block dimensions.
//
// The generated inner loop bound was `< 2`, i.e. the empty range [2, 2), so the
// body never ran and the test passed without ever invoking the kernel. The
// width bound is inclusive here so the kernel is actually called.
// NOTE(review): this file is auto-generated; the empty range presumably comes
// from the generator template and should be fixed there as well - confirm.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_5_8_bw_2_2) {
  for (size_t bh = 5; bh < 8; ++bh) {
    for (size_t bw = 2; bw <= 2; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
    }
  }
}
1213
// One case: block_width=1, block_height=4 where input_stride (2) exceeds block_width; output_stride=4.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_4_bw_1_is_2) {
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(1)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
}
1223
// One case: block_width=1, block_height=4 where output_stride (8) exceeds block_height; input_stride=1.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_4_bw_1_os_8) {
  TransposeMicrokernelTester()
    .input_stride(1)
    .output_stride(8)
    .block_width(1)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
}
1233
// One case: block_width=1, block_height=4 with input_stride=2 and output_stride=8.
TEST(X64_TRANSPOSE__4X1_SCALAR_FLOAT, bh_4_bw_1_is_2_os_8) {
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(8)
    .block_width(1)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x1_scalar_float);
}
1243
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_4_bw_2) {
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
}
1253
// block_height in [1, 8] x block_width in [1, 4]; strides match the block dimensions.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_1_8_bw_1_4) {
  for (size_t bh = 1; bh <= 8; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
    }
  }
}
1267
// One case: block_width=4, block_height=4, input_stride=4, output_stride=4.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_4_bw_4) {
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(4)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
}
1277
// block_width in [3, 4) with block_height=4 and output_stride=4; input_stride follows the width.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_4_bw_3_4) {
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
  }
}
1289
// block_width in [3, 4) with block_height=8 and output_stride=8; input_stride follows the width.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_8_bw_3_4) {
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(8)
      .block_width(bw)
      .block_height(8)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
  }
}
1301
// One case: block_width=2, block_height=8, input_stride=2, output_stride=8.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_8_bw_2) {
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(8)
    .block_width(2)
    .block_height(8)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
}
1311
// block_height in [5, 8) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_5_8_bw_2) {
  for (size_t bh = 5; bh < 8; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
  }
}
1323
// block_height in [5, 8) with block_width=4 and input_stride=4; output_stride follows the height.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_5_8_bw_4) {
  for (size_t bh = 5; bh < 8; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(4)
      .output_stride(bh)
      .block_width(4)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
  }
}
1335
// block_height in [5, 8) x block_width in [3, 4); strides match the block dimensions.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_5_8_bw_3_4) {
  for (size_t bh = 5; bh < 8; ++bh) {
    for (size_t bw = 3; bw < 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
    }
  }
}
1349
// One case: block_width=2, block_height=4 where input_stride (4) exceeds block_width; output_stride=4.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_4_bw_2_is_4) {
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
}
1359
// One case: block_width=2, block_height=4 where output_stride (8) exceeds block_height; input_stride=2.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_4_bw_2_os_8) {
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(8)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
}
1369
// One case: block_width=2, block_height=4 with input_stride=4 and output_stride=8.
TEST(X64_TRANSPOSE__4X2_SCALAR_FLOAT, bh_4_bw_2_is_4_os_8) {
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(8)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__4x2_scalar_float);
}
1379
1380 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One case: block_width=2, block_height=2, input_stride=2, output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
}
1391
// block_height in [1, 4] x block_width in [1, 4]; strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_1_4_bw_1_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
    }
  }
}
1406
// One case: block_width=4, block_height=2, input_stride=4, output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(4)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
}
1417
// block_width in [3, 4) with block_height=2 and output_stride=2; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(2)
      .block_width(bw)
      .block_height(2)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
  }
}
1430
// block_width in [3, 4) with block_height=4 and output_stride=4; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
  }
}
1443
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
}
1454
// block_height in [3, 4) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_3_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
  }
}
1467
// block_height in [3, 4) with block_width=4 and input_stride=4; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_3_4_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(4)
      .output_stride(bh)
      .block_width(4)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
  }
}
1480
// block_height in [3, 4) x block_width in [3, 4); strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_3_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    for (size_t bw = 3; bw < 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
    }
  }
}
1495
// One case: 2x2 block where input_stride (4) exceeds block_width (2); output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2_is_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
}
1506
// One case: 2x2 block where output_stride (4) exceeds block_height (2); input_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
}
1517
// One case: 2x2 block with both input_stride and output_stride set to 4.
TEST(X64_TRANSPOSE__2X2_MULTI_MOV_SSE2, bh_2_bw_2_is_4_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_mov_sse2);
}
1528 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1529
1530
1531 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One case: block_width=2, block_height=2, input_stride=2, output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_2_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
}
1542
// block_height in [1, 4] x block_width in [1, 4]; strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_1_4_bw_1_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
    }
  }
}
1557
// One case: block_width=4, block_height=2, input_stride=4, output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_2_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(4)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
}
1568
// block_width in [3, 4) with block_height=2 and output_stride=2; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_2_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(2)
      .block_width(bw)
      .block_height(2)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
  }
}
1581
// block_width in [3, 4) with block_height=4 and output_stride=4; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
  }
}
1594
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
}
1605
// block_height in [3, 4) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_3_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
  }
}
1618
// block_height in [3, 4) with block_width=4 and input_stride=4; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_3_4_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(4)
      .output_stride(bh)
      .block_width(4)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
  }
}
1631
// block_height in [3, 4) x block_width in [3, 4); strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_3_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    for (size_t bw = 3; bw < 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
    }
  }
}
1646
// One case: 2x2 block where input_stride (4) exceeds block_width (2); output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_2_bw_2_is_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
}
1657
// One case: 2x2 block where output_stride (4) exceeds block_height (2); input_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_2_bw_2_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
}
1668
// One case: 2x2 block with both input_stride and output_stride set to 4.
TEST(X64_TRANSPOSE__2X2_MULTI_MULTI_SSE2, bh_2_bw_2_is_4_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_multi_sse2);
}
1679 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1680
1681
1682 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One case: block_width=2, block_height=2, input_stride=2, output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_2_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
}
1693
// block_height in [1, 4] x block_width in [1, 4]; strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_1_4_bw_1_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
    }
  }
}
1708
// One case: block_width=4, block_height=2, input_stride=4, output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_2_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(4)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
}
1719
// block_width in [3, 4) with block_height=2 and output_stride=2; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_2_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(2)
      .block_width(bw)
      .block_height(2)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
  }
}
1732
// block_width in [3, 4) with block_height=4 and output_stride=4; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
  }
}
1745
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
}
1756
// block_height in [3, 4) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_3_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
  }
}
1769
// block_height in [3, 4) with block_width=4 and input_stride=4; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_3_4_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(4)
      .output_stride(bh)
      .block_width(4)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
  }
}
1782
// block_height in [3, 4) x block_width in [3, 4); strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_3_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    for (size_t bw = 3; bw < 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
    }
  }
}
1797
// One case: 2x2 block where input_stride (4) exceeds block_width (2); output_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_2_bw_2_is_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
}
1808
// One case: 2x2 block where output_stride (4) exceeds block_height (2); input_stride=2.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_2_bw_2_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
}
1819
// One case: 2x2 block with both input_stride and output_stride set to 4.
TEST(X64_TRANSPOSE__2X2_MULTI_SWITCH_SSE2, bh_2_bw_2_is_4_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_multi_switch_sse2);
}
1830 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1831
1832
1833 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One case: block_width=2, block_height=2, input_stride=2, output_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
}
1844
// block_height in [1, 4] x block_width in [1, 4]; strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_1_4_bw_1_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
    }
  }
}
1859
// One case: block_width=4, block_height=2, input_stride=4, output_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(4)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
}
1870
// block_width in [3, 4) with block_height=2 and output_stride=2; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(2)
      .block_width(bw)
      .block_height(2)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
  }
}
1883
// block_width in [3, 4) with block_height=4 and output_stride=4; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
  }
}
1896
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
}
1907
// block_height in [3, 4) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_3_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
  }
}
1920
// block_height in [3, 4) with block_width=4 and input_stride=4; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_3_4_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(4)
      .output_stride(bh)
      .block_width(4)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
  }
}
1933
// block_height in [3, 4) x block_width in [3, 4); strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_3_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    for (size_t bw = 3; bw < 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
    }
  }
}
1948
// One case: 2x2 block where input_stride (4) exceeds block_width (2); output_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2_is_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
}
1959
// One case: 2x2 block where output_stride (4) exceeds block_height (2); input_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
}
1970
// One case: 2x2 block with both input_stride and output_stride set to 4.
TEST(X64_TRANSPOSE__2X2_REUSE_MOV_SSE2, bh_2_bw_2_is_4_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_mov_sse2);
}
1981 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1982
1983
1984 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// One case: block_width=2, block_height=2, input_stride=2, output_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_2_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
}
1995
// block_height in [1, 4] x block_width in [1, 4]; strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_1_4_bw_1_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 1; bh <= 4; ++bh) {
    for (size_t bw = 1; bw <= 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
    }
  }
}
2010
// One case: block_width=4, block_height=2, input_stride=4, output_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_2_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(4)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
}
2021
// block_width in [3, 4) with block_height=2 and output_stride=2; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_2_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(2)
      .block_width(bw)
      .block_height(2)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
  }
}
2034
// block_width in [3, 4) with block_height=4 and output_stride=4; input_stride follows the width.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bw = 3; bw < 4; ++bw) {
    TransposeMicrokernelTester()
      .input_stride(bw)
      .output_stride(4)
      .block_width(bw)
      .block_height(4)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
  }
}
2047
// One case: block_width=2, block_height=4, input_stride=2, output_stride=4.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(4)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
}
2058
// block_height in [3, 4) with block_width=2 and input_stride=2; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_3_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(2)
      .output_stride(bh)
      .block_width(2)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
  }
}
2071
// block_height in [3, 4) with block_width=4 and input_stride=4; output_stride follows the height.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_3_4_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    TransposeMicrokernelTester()
      .input_stride(4)
      .output_stride(bh)
      .block_width(4)
      .block_height(bh)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
  }
}
2084
// block_height in [3, 4) x block_width in [3, 4); strides match the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_3_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t bh = 3; bh < 4; ++bh) {
    for (size_t bw = 3; bw < 4; ++bw) {
      TransposeMicrokernelTester()
        .input_stride(bw)
        .output_stride(bh)
        .block_width(bw)
        .block_height(bh)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
    }
  }
}
2099
// One case: 2x2 block where input_stride (4) exceeds block_width (2); output_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_2_bw_2_is_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(4)
    .output_stride(2)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
}
2110
// One case: 2x2 block where output_stride (4) exceeds block_height (2); input_stride=2.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_2_bw_2_os_4) {
  TEST_REQUIRES_X86_SSE2;
  TransposeMicrokernelTester()
    .input_stride(2)
    .output_stride(4)
    .block_width(2)
    .block_height(2)
    .iterations(1)
    .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
}
2121
// Runs the 2x2 reuse-multi SSE2 kernel on a 2x2 block with both the input and
// the output stride set to 4.
TEST(X64_TRANSPOSE__2X2_REUSE_MULTI_SSE2, bh_2_bw_2_is_4_os_4) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 4;
  const size_t kOutputStride = 4;
  const size_t kBlockWidth = 2;
  const size_t kBlockHeight = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_multi_sse2);
}
2132 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2133
2134
2135 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Runs the 2x2 reuse-switch SSE2 kernel on a single 2x2 block with strides
// equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_2_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 2;
  const size_t kOutputStride = 2;
  const size_t kBlockWidth = 2;
  const size_t kBlockHeight = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
}
2146
// Runs the 2x2 reuse-switch SSE2 kernel over all 16 (height, width) pairs
// with each dimension in the inclusive range 1..4.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_1_4_bw_1_4) {
  TEST_REQUIRES_X86_SSE2;
  for (size_t height = 1; height <= 4; ++height) {
    for (size_t width = 1; width <= 4; ++width) {
      // Strides are set equal to the matching block dimension.
      TransposeMicrokernelTester()
          .input_stride(width)
          .output_stride(height)
          .block_width(width)
          .block_height(height)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
    }
  }
}
2161
// Runs the 2x2 reuse-switch SSE2 kernel on a block of height 2 and width 4
// with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_2_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 4;
  const size_t kOutputStride = 2;
  const size_t kBlockWidth = 4;
  const size_t kBlockHeight = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
}
2172
// Runs the 2x2 reuse-switch SSE2 kernel with a fixed height of 2 while the
// width (also used as the input stride) walks the range [3, 4).
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_2_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  // The same value feeds both output_stride() and block_height().
  const size_t kHeight = 2;
  // Half-open range: the body executes once, with width == 3.
  for (size_t width = 3; width < 4; ++width) {
    TransposeMicrokernelTester()
        .input_stride(width)
        .output_stride(kHeight)
        .block_width(width)
        .block_height(kHeight)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
  }
}
2185
// Runs the 2x2 reuse-switch SSE2 kernel with a fixed height of 4 while the
// width (also used as the input stride) walks the range [3, 4).
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  // The same value feeds both output_stride() and block_height().
  const size_t kHeight = 4;
  // Half-open range: the body executes once, with width == 3.
  for (size_t width = 3; width < 4; ++width) {
    TransposeMicrokernelTester()
        .input_stride(width)
        .output_stride(kHeight)
        .block_width(width)
        .block_height(kHeight)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
  }
}
2198
// Runs the 2x2 reuse-switch SSE2 kernel on a block of height 4 and width 2
// with strides equal to the block dimensions.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 2;
  const size_t kOutputStride = 4;
  const size_t kBlockWidth = 2;
  const size_t kBlockHeight = 4;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
}
2209
// Runs the 2x2 reuse-switch SSE2 kernel with a fixed width of 2 while the
// height (also used as the output stride) walks the range [3, 4).
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_3_4_bw_2) {
  TEST_REQUIRES_X86_SSE2;
  // The same value feeds both input_stride() and block_width().
  const size_t kWidth = 2;
  // Half-open range: the body executes once, with height == 3.
  for (size_t height = 3; height < 4; ++height) {
    TransposeMicrokernelTester()
        .input_stride(kWidth)
        .output_stride(height)
        .block_width(kWidth)
        .block_height(height)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
  }
}
2222
// Runs the 2x2 reuse-switch SSE2 kernel with a fixed width of 4 while the
// height (also used as the output stride) walks the range [3, 4).
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_3_4_bw_4) {
  TEST_REQUIRES_X86_SSE2;
  // The same value feeds both input_stride() and block_width().
  const size_t kWidth = 4;
  // Half-open range: the body executes once, with height == 3.
  for (size_t height = 3; height < 4; ++height) {
    TransposeMicrokernelTester()
        .input_stride(kWidth)
        .output_stride(height)
        .block_width(kWidth)
        .block_height(height)
        .iterations(1)
        .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
  }
}
2235
// Runs the 2x2 reuse-switch SSE2 kernel over every (height, width) pair with
// both dimensions drawn from the range [3, 4).
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_3_4_bw_3_4) {
  TEST_REQUIRES_X86_SSE2;
  // Both ranges are half-open, so the only pair visited is (3, 3).
  for (size_t height = 3; height < 4; ++height) {
    for (size_t width = 3; width < 4; ++width) {
      // Strides are set equal to the matching block dimension.
      TransposeMicrokernelTester()
          .input_stride(width)
          .output_stride(height)
          .block_width(width)
          .block_height(height)
          .iterations(1)
          .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
    }
  }
}
2250
// Runs the 2x2 reuse-switch SSE2 kernel on a 2x2 block whose input stride (4)
// is larger than the block width.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_2_bw_2_is_4) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 4;
  const size_t kOutputStride = 2;
  const size_t kBlockWidth = 2;
  const size_t kBlockHeight = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
}
2261
// Runs the 2x2 reuse-switch SSE2 kernel on a 2x2 block whose output stride (4)
// is larger than the block height.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_2_bw_2_os_4) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 2;
  const size_t kOutputStride = 4;
  const size_t kBlockWidth = 2;
  const size_t kBlockHeight = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
}
2272
// Runs the 2x2 reuse-switch SSE2 kernel on a 2x2 block with both the input
// and the output stride set to 4.
TEST(X64_TRANSPOSE__2X2_REUSE_SWITCH_SSE2, bh_2_bw_2_is_4_os_4) {
  TEST_REQUIRES_X86_SSE2;
  // Tester parameters, each named after the setter it is passed to.
  const size_t kInputStride = 4;
  const size_t kOutputStride = 4;
  const size_t kBlockWidth = 2;
  const size_t kBlockHeight = 2;
  TransposeMicrokernelTester()
      .input_stride(kInputStride)
      .output_stride(kOutputStride)
      .block_width(kBlockWidth)
      .block_height(kBlockHeight)
      .iterations(1)
      .Test(xnn_x64_transpose_ukernel__2x2_reuse_switch_sse2);
}
2283 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2284