1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 // 9 // Auto-generated file. Do not edit! 10 // Specification: test/f16-gemm-minmax.yaml 11 // Generator: tools/generate-gemm-test.py 12 13 14 #include <gtest/gtest.h> 15 16 #include <xnnpack/common.h> 17 #include <xnnpack/isa-checks.h> 18 19 #include <xnnpack/gemm.h> 20 #include <xnnpack/igemm.h> 21 #include <xnnpack/ppmm.h> 22 #include "gemm-microkernel-tester.h" 23 24 25 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4)26 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4) { 27 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 28 GemmMicrokernelTester() 29 .mr(1) 30 .nr(8) 31 .kr(1) 32 .sr(1) 33 .m(1) 34 .n(8) 35 .k(4) 36 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 37 } 38 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cn)39 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cn) { 40 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 41 GemmMicrokernelTester() 42 .mr(1) 43 .nr(8) 44 .kr(1) 45 .sr(1) 46 .m(1) 47 .n(8) 48 .k(4) 49 .cn_stride(11) 50 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 51 } 52 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)53 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 54 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 55 GemmMicrokernelTester() 56 .mr(1) 57 .nr(8) 58 .kr(1) 59 .sr(1) 60 .m(1) 61 .n(8) 62 .k(4) 63 .a_stride(7) 64 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 65 } 66 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile)67 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 68 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 69 for (uint32_t m = 1; m <= 1; m++) { 70 for (uint32_t n = 1; n <= 8; n++) { 71 GemmMicrokernelTester() 72 .mr(1) 73 .nr(8) 74 .kr(1) 75 .sr(1) 76 .m(m) 77 .n(n) 78 .k(4) 79 
.iterations(1) 80 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 81 } 82 } 83 } 84 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)85 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 86 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 87 for (uint32_t m = 1; m <= 1; m++) { 88 GemmMicrokernelTester() 89 .mr(1) 90 .nr(8) 91 .kr(1) 92 .sr(1) 93 .m(m) 94 .n(8) 95 .k(4) 96 .iterations(1) 97 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 98 } 99 } 100 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)101 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 102 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 103 for (uint32_t n = 1; n <= 8; n++) { 104 GemmMicrokernelTester() 105 .mr(1) 106 .nr(8) 107 .kr(1) 108 .sr(1) 109 .m(1) 110 .n(n) 111 .k(4) 112 .iterations(1) 113 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 114 } 115 } 116 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_lt_4)117 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4) { 118 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 119 for (size_t k = 1; k < 4; k++) { 120 GemmMicrokernelTester() 121 .mr(1) 122 .nr(8) 123 .kr(1) 124 .sr(1) 125 .m(1) 126 .n(8) 127 .k(k) 128 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 129 } 130 } 131 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)132 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 133 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 134 for (size_t k = 1; k < 4; k++) { 135 GemmMicrokernelTester() 136 .mr(1) 137 .nr(8) 138 .kr(1) 139 .sr(1) 140 .m(1) 141 .n(8) 142 .k(k) 143 .a_stride(7) 144 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 145 } 146 } 147 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_lt_4_subtile)148 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 149 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 150 for (size_t k = 1; k < 4; k++) { 151 for (uint32_t m = 1; m <= 1; m++) { 152 for (uint32_t n = 1; n <= 8; n++) { 153 GemmMicrokernelTester() 154 
.mr(1) 155 .nr(8) 156 .kr(1) 157 .sr(1) 158 .m(m) 159 .n(n) 160 .k(k) 161 .iterations(1) 162 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 163 } 164 } 165 } 166 } 167 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_gt_4)168 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4) { 169 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 170 for (size_t k = 5; k < 8; k++) { 171 GemmMicrokernelTester() 172 .mr(1) 173 .nr(8) 174 .kr(1) 175 .sr(1) 176 .m(1) 177 .n(8) 178 .k(k) 179 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 180 } 181 } 182 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)183 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 184 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 185 for (size_t k = 5; k < 8; k++) { 186 GemmMicrokernelTester() 187 .mr(1) 188 .nr(8) 189 .kr(1) 190 .sr(1) 191 .m(1) 192 .n(8) 193 .k(k) 194 .a_stride(11) 195 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 196 } 197 } 198 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_gt_4_subtile)199 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 200 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 201 for (size_t k = 5; k < 8; k++) { 202 for (uint32_t m = 1; m <= 1; m++) { 203 for (uint32_t n = 1; n <= 8; n++) { 204 GemmMicrokernelTester() 205 .mr(1) 206 .nr(8) 207 .kr(1) 208 .sr(1) 209 .m(m) 210 .n(n) 211 .k(k) 212 .iterations(1) 213 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 214 } 215 } 216 } 217 } 218 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_div_4)219 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4) { 220 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 221 for (size_t k = 8; k <= 40; k += 4) { 222 GemmMicrokernelTester() 223 .mr(1) 224 .nr(8) 225 .kr(1) 226 .sr(1) 227 .m(1) 228 .n(8) 229 .k(k) 230 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 231 } 232 } 233 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_div_4_strided_a)234 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 235 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 236 for (size_t k = 8; k <= 40; k += 4) { 237 GemmMicrokernelTester() 238 .mr(1) 239 .nr(8) 240 .kr(1) 241 .sr(1) 242 .m(1) 243 .n(8) 244 .k(k) 245 .a_stride(43) 246 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 247 } 248 } 249 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_div_4_subtile)250 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 251 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 252 for (size_t k = 8; k <= 40; k += 4) { 253 for (uint32_t m = 1; m <= 1; m++) { 254 for (uint32_t n = 1; n <= 8; n++) { 255 GemmMicrokernelTester() 256 .mr(1) 257 .nr(8) 258 .kr(1) 259 .sr(1) 260 .m(m) 261 .n(n) 262 .k(k) 263 .iterations(1) 264 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 265 } 266 } 267 } 268 } 269 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_gt_8)270 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8) { 271 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 272 for (uint32_t n = 9; n < 16; n++) { 273 for (size_t k = 1; k <= 20; k += 5) { 274 GemmMicrokernelTester() 275 .mr(1) 276 .nr(8) 277 .kr(1) 278 .sr(1) 279 .m(1) 280 .n(8) 281 .k(k) 282 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 283 } 284 } 285 } 286 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)287 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 288 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 289 for (uint32_t n = 9; n < 16; n++) { 290 for (size_t k = 1; k <= 20; k += 5) { 291 GemmMicrokernelTester() 292 .mr(1) 293 .nr(8) 294 .kr(1) 295 .sr(1) 296 .m(1) 297 .n(8) 298 .k(k) 299 .cn_stride(11) 300 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 301 } 302 } 303 } 304 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)305 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 306 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 307 for (uint32_t n = 9; n < 16; n++) { 308 for (size_t k = 1; k <= 20; k += 5) { 309 GemmMicrokernelTester() 310 .mr(1) 311 .nr(8) 312 .kr(1) 313 .sr(1) 314 .m(1) 
315 .n(n) 316 .k(k) 317 .a_stride(23) 318 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 319 } 320 } 321 } 322 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_gt_8_subtile)323 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 324 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 325 for (uint32_t n = 9; n < 16; n++) { 326 for (size_t k = 1; k <= 20; k += 5) { 327 for (uint32_t m = 1; m <= 1; m++) { 328 GemmMicrokernelTester() 329 .mr(1) 330 .nr(8) 331 .kr(1) 332 .sr(1) 333 .m(m) 334 .n(n) 335 .k(k) 336 .iterations(1) 337 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 338 } 339 } 340 } 341 } 342 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8)343 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8) { 344 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 345 for (uint32_t n = 16; n <= 24; n += 8) { 346 for (size_t k = 1; k <= 20; k += 5) { 347 GemmMicrokernelTester() 348 .mr(1) 349 .nr(8) 350 .kr(1) 351 .sr(1) 352 .m(1) 353 .n(8) 354 .k(k) 355 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 356 } 357 } 358 } 359 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)360 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 361 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 362 for (uint32_t n = 16; n <= 24; n += 8) { 363 for (size_t k = 1; k <= 20; k += 5) { 364 GemmMicrokernelTester() 365 .mr(1) 366 .nr(8) 367 .kr(1) 368 .sr(1) 369 .m(1) 370 .n(n) 371 .k(k) 372 .cn_stride(11) 373 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 374 } 375 } 376 } 377 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_strided_a)378 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 379 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 380 for (uint32_t n = 16; n <= 24; n += 8) { 381 for (size_t k = 1; k <= 20; k += 5) { 382 GemmMicrokernelTester() 383 .mr(1) 384 .nr(8) 385 .kr(1) 386 .sr(1) 387 .m(1) 388 .n(n) 389 .k(k) 390 .a_stride(23) 391 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 392 } 393 } 394 } 395 
TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_subtile)396 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 397 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 398 for (uint32_t n = 16; n <= 24; n += 8) { 399 for (size_t k = 1; k <= 20; k += 5) { 400 for (uint32_t m = 1; m <= 1; m++) { 401 GemmMicrokernelTester() 402 .mr(1) 403 .nr(8) 404 .kr(1) 405 .sr(1) 406 .m(m) 407 .n(n) 408 .k(k) 409 .iterations(1) 410 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 411 } 412 } 413 } 414 } 415 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cm_subtile)416 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 417 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 418 for (size_t k = 1; k <= 20; k += 5) { 419 for (uint32_t m = 1; m <= 1; m++) { 420 for (uint32_t n = 1; n <= 8; n++) { 421 GemmMicrokernelTester() 422 .mr(1) 423 .nr(8) 424 .kr(1) 425 .sr(1) 426 .m(m) 427 .n(n) 428 .k(k) 429 .cm_stride(11) 430 .iterations(1) 431 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 432 } 433 } 434 } 435 } 436 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,qmin)437 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmin) { 438 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 439 GemmMicrokernelTester() 440 .mr(1) 441 .nr(8) 442 .kr(1) 443 .sr(1) 444 .m(1) 445 .n(8) 446 .k(4) 447 .qmin(128) 448 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 449 } 450 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,qmax)451 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmax) { 452 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 453 GemmMicrokernelTester() 454 .mr(1) 455 .nr(8) 456 .kr(1) 457 .sr(1) 458 .m(1) 459 .n(8) 460 .k(4) 461 .qmax(128) 462 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 463 } 464 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cm)465 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm) { 466 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 467 GemmMicrokernelTester() 468 .mr(1) 469 .nr(8) 470 .kr(1) 471 .sr(1) 472 .m(1) 473 .n(8) 474 .k(4) 475 .cm_stride(11) 476 
.Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64); 477 } 478 #endif // XNN_ARCH_ARM64 479 480 481 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4)482 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4) { 483 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 484 GemmMicrokernelTester() 485 .mr(4) 486 .nr(8) 487 .kr(1) 488 .sr(1) 489 .m(4) 490 .n(8) 491 .k(4) 492 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 493 } 494 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cn)495 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cn) { 496 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 497 GemmMicrokernelTester() 498 .mr(4) 499 .nr(8) 500 .kr(1) 501 .sr(1) 502 .m(4) 503 .n(8) 504 .k(4) 505 .cn_stride(11) 506 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 507 } 508 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)509 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 510 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 511 GemmMicrokernelTester() 512 .mr(4) 513 .nr(8) 514 .kr(1) 515 .sr(1) 516 .m(4) 517 .n(8) 518 .k(4) 519 .a_stride(7) 520 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 521 } 522 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile)523 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 524 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 525 for (uint32_t m = 1; m <= 4; m++) { 526 for (uint32_t n = 1; n <= 8; n++) { 527 GemmMicrokernelTester() 528 .mr(4) 529 .nr(8) 530 .kr(1) 531 .sr(1) 532 .m(m) 533 .n(n) 534 .k(4) 535 .iterations(1) 536 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 537 } 538 } 539 } 540 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)541 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 542 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 543 for (uint32_t m = 1; m <= 4; m++) { 544 GemmMicrokernelTester() 545 .mr(4) 546 .nr(8) 547 .kr(1) 548 .sr(1) 549 .m(m) 550 .n(8) 551 .k(4) 552 .iterations(1) 553 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 554 } 555 } 556 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)557 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 558 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 559 for (uint32_t n = 1; n <= 8; n++) { 560 GemmMicrokernelTester() 561 .mr(4) 562 .nr(8) 563 .kr(1) 564 .sr(1) 565 .m(4) 566 .n(n) 567 .k(4) 568 .iterations(1) 569 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 570 } 571 } 572 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4)573 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4) { 574 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 575 for (size_t k = 1; k < 4; k++) { 576 GemmMicrokernelTester() 577 .mr(4) 578 .nr(8) 579 .kr(1) 580 .sr(1) 581 .m(4) 582 .n(8) 583 .k(k) 584 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 585 } 586 } 587 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)588 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 589 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 590 for (size_t k = 1; k < 4; k++) { 591 GemmMicrokernelTester() 592 .mr(4) 593 .nr(8) 594 .kr(1) 595 .sr(1) 596 .m(4) 597 .n(8) 598 .k(k) 599 .a_stride(7) 600 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 601 } 602 } 603 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4_subtile)604 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 605 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 606 for (size_t k = 1; k < 4; k++) { 607 for (uint32_t m = 1; m <= 4; m++) { 608 for (uint32_t n = 1; n <= 8; n++) { 609 GemmMicrokernelTester() 610 .mr(4) 611 .nr(8) 612 .kr(1) 613 .sr(1) 614 .m(m) 615 .n(n) 616 .k(k) 617 .iterations(1) 618 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 619 } 620 } 621 } 622 } 623 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4)624 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4) { 625 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 626 for (size_t k = 5; k < 8; k++) { 627 GemmMicrokernelTester() 628 .mr(4) 629 .nr(8) 
630 .kr(1) 631 .sr(1) 632 .m(4) 633 .n(8) 634 .k(k) 635 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 636 } 637 } 638 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)639 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 640 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 641 for (size_t k = 5; k < 8; k++) { 642 GemmMicrokernelTester() 643 .mr(4) 644 .nr(8) 645 .kr(1) 646 .sr(1) 647 .m(4) 648 .n(8) 649 .k(k) 650 .a_stride(11) 651 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 652 } 653 } 654 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4_subtile)655 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 656 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 657 for (size_t k = 5; k < 8; k++) { 658 for (uint32_t m = 1; m <= 4; m++) { 659 for (uint32_t n = 1; n <= 8; n++) { 660 GemmMicrokernelTester() 661 .mr(4) 662 .nr(8) 663 .kr(1) 664 .sr(1) 665 .m(m) 666 .n(n) 667 .k(k) 668 .iterations(1) 669 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 670 } 671 } 672 } 673 } 674 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4)675 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4) { 676 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 677 for (size_t k = 8; k <= 40; k += 4) { 678 GemmMicrokernelTester() 679 .mr(4) 680 .nr(8) 681 .kr(1) 682 .sr(1) 683 .m(4) 684 .n(8) 685 .k(k) 686 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 687 } 688 } 689 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4_strided_a)690 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 691 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 692 for (size_t k = 8; k <= 40; k += 4) { 693 GemmMicrokernelTester() 694 .mr(4) 695 .nr(8) 696 .kr(1) 697 .sr(1) 698 .m(4) 699 .n(8) 700 .k(k) 701 .a_stride(43) 702 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 703 } 704 } 705 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4_subtile)706 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 707 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 
708 for (size_t k = 8; k <= 40; k += 4) { 709 for (uint32_t m = 1; m <= 4; m++) { 710 for (uint32_t n = 1; n <= 8; n++) { 711 GemmMicrokernelTester() 712 .mr(4) 713 .nr(8) 714 .kr(1) 715 .sr(1) 716 .m(m) 717 .n(n) 718 .k(k) 719 .iterations(1) 720 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 721 } 722 } 723 } 724 } 725 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8)726 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8) { 727 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 728 for (uint32_t n = 9; n < 16; n++) { 729 for (size_t k = 1; k <= 20; k += 5) { 730 GemmMicrokernelTester() 731 .mr(4) 732 .nr(8) 733 .kr(1) 734 .sr(1) 735 .m(4) 736 .n(8) 737 .k(k) 738 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 739 } 740 } 741 } 742 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)743 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 744 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 745 for (uint32_t n = 9; n < 16; n++) { 746 for (size_t k = 1; k <= 20; k += 5) { 747 GemmMicrokernelTester() 748 .mr(4) 749 .nr(8) 750 .kr(1) 751 .sr(1) 752 .m(4) 753 .n(8) 754 .k(k) 755 .cn_stride(11) 756 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 757 } 758 } 759 } 760 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)761 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 762 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 763 for (uint32_t n = 9; n < 16; n++) { 764 for (size_t k = 1; k <= 20; k += 5) { 765 GemmMicrokernelTester() 766 .mr(4) 767 .nr(8) 768 .kr(1) 769 .sr(1) 770 .m(4) 771 .n(n) 772 .k(k) 773 .a_stride(23) 774 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 775 } 776 } 777 } 778 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_subtile)779 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 780 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 781 for (uint32_t n = 9; n < 16; n++) { 782 for (size_t k = 1; k <= 20; k += 5) { 783 for (uint32_t m = 1; m <= 4; m++) { 784 GemmMicrokernelTester() 785 
.mr(4) 786 .nr(8) 787 .kr(1) 788 .sr(1) 789 .m(m) 790 .n(n) 791 .k(k) 792 .iterations(1) 793 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 794 } 795 } 796 } 797 } 798 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8)799 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8) { 800 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 801 for (uint32_t n = 16; n <= 24; n += 8) { 802 for (size_t k = 1; k <= 20; k += 5) { 803 GemmMicrokernelTester() 804 .mr(4) 805 .nr(8) 806 .kr(1) 807 .sr(1) 808 .m(4) 809 .n(8) 810 .k(k) 811 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 812 } 813 } 814 } 815 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)816 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 817 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 818 for (uint32_t n = 16; n <= 24; n += 8) { 819 for (size_t k = 1; k <= 20; k += 5) { 820 GemmMicrokernelTester() 821 .mr(4) 822 .nr(8) 823 .kr(1) 824 .sr(1) 825 .m(4) 826 .n(n) 827 .k(k) 828 .cn_stride(11) 829 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 830 } 831 } 832 } 833 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_strided_a)834 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 835 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 836 for (uint32_t n = 16; n <= 24; n += 8) { 837 for (size_t k = 1; k <= 20; k += 5) { 838 GemmMicrokernelTester() 839 .mr(4) 840 .nr(8) 841 .kr(1) 842 .sr(1) 843 .m(4) 844 .n(n) 845 .k(k) 846 .a_stride(23) 847 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 848 } 849 } 850 } 851 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_subtile)852 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 853 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 854 for (uint32_t n = 16; n <= 24; n += 8) { 855 for (size_t k = 1; k <= 20; k += 5) { 856 for (uint32_t m = 1; m <= 4; m++) { 857 GemmMicrokernelTester() 858 .mr(4) 859 .nr(8) 860 .kr(1) 861 .sr(1) 862 .m(m) 863 .n(n) 864 .k(k) 865 .iterations(1) 866 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 867 } 868 } 869 } 870 } 871 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cm_subtile)872 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 873 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 874 for (size_t k = 1; k <= 20; k += 5) { 875 for (uint32_t m = 1; m <= 4; m++) { 876 for (uint32_t n = 1; n <= 8; n++) { 877 GemmMicrokernelTester() 878 .mr(4) 879 .nr(8) 880 .kr(1) 881 .sr(1) 882 .m(m) 883 .n(n) 884 .k(k) 885 .cm_stride(11) 886 .iterations(1) 887 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 888 } 889 } 890 } 891 } 892 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,qmin)893 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmin) { 894 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 895 GemmMicrokernelTester() 896 .mr(4) 897 .nr(8) 898 .kr(1) 899 .sr(1) 900 .m(4) 901 .n(8) 902 .k(4) 903 .qmin(128) 904 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 905 } 906 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,qmax)907 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmax) { 908 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 909 GemmMicrokernelTester() 910 .mr(4) 911 .nr(8) 912 .kr(1) 913 .sr(1) 914 .m(4) 915 .n(8) 916 .k(4) 917 .qmax(128) 918 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 919 } 920 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cm)921 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm) { 922 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 923 GemmMicrokernelTester() 924 .mr(4) 925 .nr(8) 926 .kr(1) 927 .sr(1) 928 .m(4) 929 .n(8) 930 .k(4) 931 .cm_stride(11) 932 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64); 933 } 934 #endif // XNN_ARCH_ARM64 935 936 937 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4)938 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4) { 939 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 940 GemmMicrokernelTester() 941 .mr(6) 942 .nr(8) 943 .kr(1) 944 .sr(1) 945 .m(6) 946 .n(8) 947 .k(4) 948 
.Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 949 } 950 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cn)951 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cn) { 952 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 953 GemmMicrokernelTester() 954 .mr(6) 955 .nr(8) 956 .kr(1) 957 .sr(1) 958 .m(6) 959 .n(8) 960 .k(4) 961 .cn_stride(11) 962 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 963 } 964 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)965 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 966 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 967 GemmMicrokernelTester() 968 .mr(6) 969 .nr(8) 970 .kr(1) 971 .sr(1) 972 .m(6) 973 .n(8) 974 .k(4) 975 .a_stride(7) 976 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 977 } 978 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile)979 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 980 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 981 for (uint32_t m = 1; m <= 6; m++) { 982 for (uint32_t n = 1; n <= 8; n++) { 983 GemmMicrokernelTester() 984 .mr(6) 985 .nr(8) 986 .kr(1) 987 .sr(1) 988 .m(m) 989 .n(n) 990 .k(4) 991 .iterations(1) 992 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 993 } 994 } 995 } 996 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)997 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 998 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 999 for (uint32_t m = 1; m <= 6; m++) { 1000 GemmMicrokernelTester() 1001 .mr(6) 1002 .nr(8) 1003 .kr(1) 1004 .sr(1) 1005 .m(m) 1006 .n(8) 1007 .k(4) 1008 .iterations(1) 1009 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1010 } 1011 } 1012 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)1013 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 1014 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1015 for (uint32_t n = 1; n <= 8; n++) { 1016 GemmMicrokernelTester() 1017 .mr(6) 1018 .nr(8) 1019 .kr(1) 1020 .sr(1) 1021 .m(6) 1022 .n(n) 1023 .k(4) 1024 
.iterations(1) 1025 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1026 } 1027 } 1028 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_lt_4)1029 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4) { 1030 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1031 for (size_t k = 1; k < 4; k++) { 1032 GemmMicrokernelTester() 1033 .mr(6) 1034 .nr(8) 1035 .kr(1) 1036 .sr(1) 1037 .m(6) 1038 .n(8) 1039 .k(k) 1040 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1041 } 1042 } 1043 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)1044 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 1045 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1046 for (size_t k = 1; k < 4; k++) { 1047 GemmMicrokernelTester() 1048 .mr(6) 1049 .nr(8) 1050 .kr(1) 1051 .sr(1) 1052 .m(6) 1053 .n(8) 1054 .k(k) 1055 .a_stride(7) 1056 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1057 } 1058 } 1059 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_lt_4_subtile)1060 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 1061 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1062 for (size_t k = 1; k < 4; k++) { 1063 for (uint32_t m = 1; m <= 6; m++) { 1064 for (uint32_t n = 1; n <= 8; n++) { 1065 GemmMicrokernelTester() 1066 .mr(6) 1067 .nr(8) 1068 .kr(1) 1069 .sr(1) 1070 .m(m) 1071 .n(n) 1072 .k(k) 1073 .iterations(1) 1074 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1075 } 1076 } 1077 } 1078 } 1079 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_gt_4)1080 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4) { 1081 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1082 for (size_t k = 5; k < 8; k++) { 1083 GemmMicrokernelTester() 1084 .mr(6) 1085 .nr(8) 1086 .kr(1) 1087 .sr(1) 1088 .m(6) 1089 .n(8) 1090 .k(k) 1091 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1092 } 1093 } 1094 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)1095 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 1096 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1097 for (size_t 
k = 5; k < 8; k++) { 1098 GemmMicrokernelTester() 1099 .mr(6) 1100 .nr(8) 1101 .kr(1) 1102 .sr(1) 1103 .m(6) 1104 .n(8) 1105 .k(k) 1106 .a_stride(11) 1107 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1108 } 1109 } 1110 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_gt_4_subtile)1111 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 1112 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1113 for (size_t k = 5; k < 8; k++) { 1114 for (uint32_t m = 1; m <= 6; m++) { 1115 for (uint32_t n = 1; n <= 8; n++) { 1116 GemmMicrokernelTester() 1117 .mr(6) 1118 .nr(8) 1119 .kr(1) 1120 .sr(1) 1121 .m(m) 1122 .n(n) 1123 .k(k) 1124 .iterations(1) 1125 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1126 } 1127 } 1128 } 1129 } 1130 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_div_4)1131 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4) { 1132 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1133 for (size_t k = 8; k <= 40; k += 4) { 1134 GemmMicrokernelTester() 1135 .mr(6) 1136 .nr(8) 1137 .kr(1) 1138 .sr(1) 1139 .m(6) 1140 .n(8) 1141 .k(k) 1142 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1143 } 1144 } 1145 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_div_4_strided_a)1146 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 1147 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1148 for (size_t k = 8; k <= 40; k += 4) { 1149 GemmMicrokernelTester() 1150 .mr(6) 1151 .nr(8) 1152 .kr(1) 1153 .sr(1) 1154 .m(6) 1155 .n(8) 1156 .k(k) 1157 .a_stride(43) 1158 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1159 } 1160 } 1161 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_div_4_subtile)1162 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 1163 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1164 for (size_t k = 8; k <= 40; k += 4) { 1165 for (uint32_t m = 1; m <= 6; m++) { 1166 for (uint32_t n = 1; n <= 8; n++) { 1167 GemmMicrokernelTester() 1168 .mr(6) 1169 .nr(8) 1170 .kr(1) 1171 .sr(1) 1172 .m(m) 1173 .n(n) 1174 .k(k) 1175 .iterations(1) 
1176 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1177 } 1178 } 1179 } 1180 } 1181 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8)1182 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8) { 1183 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1184 for (uint32_t n = 9; n < 16; n++) { 1185 for (size_t k = 1; k <= 20; k += 5) { 1186 GemmMicrokernelTester() 1187 .mr(6) 1188 .nr(8) 1189 .kr(1) 1190 .sr(1) 1191 .m(6) 1192 .n(8) 1193 .k(k) 1194 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1195 } 1196 } 1197 } 1198 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)1199 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 1200 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1201 for (uint32_t n = 9; n < 16; n++) { 1202 for (size_t k = 1; k <= 20; k += 5) { 1203 GemmMicrokernelTester() 1204 .mr(6) 1205 .nr(8) 1206 .kr(1) 1207 .sr(1) 1208 .m(6) 1209 .n(8) 1210 .k(k) 1211 .cn_stride(11) 1212 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1213 } 1214 } 1215 } 1216 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)1217 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 1218 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1219 for (uint32_t n = 9; n < 16; n++) { 1220 for (size_t k = 1; k <= 20; k += 5) { 1221 GemmMicrokernelTester() 1222 .mr(6) 1223 .nr(8) 1224 .kr(1) 1225 .sr(1) 1226 .m(6) 1227 .n(n) 1228 .k(k) 1229 .a_stride(23) 1230 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1231 } 1232 } 1233 } 1234 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_subtile)1235 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 1236 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1237 for (uint32_t n = 9; n < 16; n++) { 1238 for (size_t k = 1; k <= 20; k += 5) { 1239 for (uint32_t m = 1; m <= 6; m++) { 1240 GemmMicrokernelTester() 1241 .mr(6) 1242 .nr(8) 1243 .kr(1) 1244 .sr(1) 1245 .m(m) 1246 .n(n) 1247 .k(k) 1248 .iterations(1) 1249 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1250 } 1251 } 
1252 } 1253 } 1254 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8)1255 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8) { 1256 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1257 for (uint32_t n = 16; n <= 24; n += 8) { 1258 for (size_t k = 1; k <= 20; k += 5) { 1259 GemmMicrokernelTester() 1260 .mr(6) 1261 .nr(8) 1262 .kr(1) 1263 .sr(1) 1264 .m(6) 1265 .n(8) 1266 .k(k) 1267 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1268 } 1269 } 1270 } 1271 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)1272 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 1273 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1274 for (uint32_t n = 16; n <= 24; n += 8) { 1275 for (size_t k = 1; k <= 20; k += 5) { 1276 GemmMicrokernelTester() 1277 .mr(6) 1278 .nr(8) 1279 .kr(1) 1280 .sr(1) 1281 .m(6) 1282 .n(n) 1283 .k(k) 1284 .cn_stride(11) 1285 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1286 } 1287 } 1288 } 1289 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_strided_a)1290 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 1291 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1292 for (uint32_t n = 16; n <= 24; n += 8) { 1293 for (size_t k = 1; k <= 20; k += 5) { 1294 GemmMicrokernelTester() 1295 .mr(6) 1296 .nr(8) 1297 .kr(1) 1298 .sr(1) 1299 .m(6) 1300 .n(n) 1301 .k(k) 1302 .a_stride(23) 1303 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1304 } 1305 } 1306 } 1307 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_subtile)1308 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 1309 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1310 for (uint32_t n = 16; n <= 24; n += 8) { 1311 for (size_t k = 1; k <= 20; k += 5) { 1312 for (uint32_t m = 1; m <= 6; m++) { 1313 GemmMicrokernelTester() 1314 .mr(6) 1315 .nr(8) 1316 .kr(1) 1317 .sr(1) 1318 .m(m) 1319 .n(n) 1320 .k(k) 1321 .iterations(1) 1322 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1323 } 1324 } 1325 } 1326 } 1327 
TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cm_subtile)1328 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 1329 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1330 for (size_t k = 1; k <= 20; k += 5) { 1331 for (uint32_t m = 1; m <= 6; m++) { 1332 for (uint32_t n = 1; n <= 8; n++) { 1333 GemmMicrokernelTester() 1334 .mr(6) 1335 .nr(8) 1336 .kr(1) 1337 .sr(1) 1338 .m(m) 1339 .n(n) 1340 .k(k) 1341 .cm_stride(11) 1342 .iterations(1) 1343 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1344 } 1345 } 1346 } 1347 } 1348 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,qmin)1349 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmin) { 1350 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1351 GemmMicrokernelTester() 1352 .mr(6) 1353 .nr(8) 1354 .kr(1) 1355 .sr(1) 1356 .m(6) 1357 .n(8) 1358 .k(4) 1359 .qmin(128) 1360 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1361 } 1362 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,qmax)1363 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmax) { 1364 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1365 GemmMicrokernelTester() 1366 .mr(6) 1367 .nr(8) 1368 .kr(1) 1369 .sr(1) 1370 .m(6) 1371 .n(8) 1372 .k(4) 1373 .qmax(128) 1374 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1375 } 1376 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cm)1377 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm) { 1378 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1379 GemmMicrokernelTester() 1380 .mr(6) 1381 .nr(8) 1382 .kr(1) 1383 .sr(1) 1384 .m(6) 1385 .n(8) 1386 .k(4) 1387 .cm_stride(11) 1388 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64); 1389 } 1390 #endif // XNN_ARCH_ARM64 1391 1392 1393 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4)1394 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4) { 1395 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1396 GemmMicrokernelTester() 1397 .mr(8) 1398 .nr(8) 1399 .kr(1) 1400 .sr(1) 1401 .m(8) 1402 .n(8) 1403 .k(4) 1404 
.Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1405 } 1406 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cn)1407 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cn) { 1408 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1409 GemmMicrokernelTester() 1410 .mr(8) 1411 .nr(8) 1412 .kr(1) 1413 .sr(1) 1414 .m(8) 1415 .n(8) 1416 .k(4) 1417 .cn_stride(11) 1418 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1419 } 1420 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_strided_a)1421 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 1422 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1423 GemmMicrokernelTester() 1424 .mr(8) 1425 .nr(8) 1426 .kr(1) 1427 .sr(1) 1428 .m(8) 1429 .n(8) 1430 .k(4) 1431 .a_stride(7) 1432 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1433 } 1434 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile)1435 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 1436 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1437 for (uint32_t m = 1; m <= 8; m++) { 1438 for (uint32_t n = 1; n <= 8; n++) { 1439 GemmMicrokernelTester() 1440 .mr(8) 1441 .nr(8) 1442 .kr(1) 1443 .sr(1) 1444 .m(m) 1445 .n(n) 1446 .k(4) 1447 .iterations(1) 1448 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1449 } 1450 } 1451 } 1452 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)1453 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 1454 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1455 for (uint32_t m = 1; m <= 8; m++) { 1456 GemmMicrokernelTester() 1457 .mr(8) 1458 .nr(8) 1459 .kr(1) 1460 .sr(1) 1461 .m(m) 1462 .n(8) 1463 .k(4) 1464 .iterations(1) 1465 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1466 } 1467 } 1468 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)1469 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 1470 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1471 for (uint32_t n = 1; n <= 8; n++) { 1472 GemmMicrokernelTester() 1473 .mr(8) 1474 .nr(8) 1475 .kr(1) 
1476 .sr(1) 1477 .m(8) 1478 .n(n) 1479 .k(4) 1480 .iterations(1) 1481 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1482 } 1483 } 1484 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4)1485 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4) { 1486 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1487 for (size_t k = 1; k < 4; k++) { 1488 GemmMicrokernelTester() 1489 .mr(8) 1490 .nr(8) 1491 .kr(1) 1492 .sr(1) 1493 .m(8) 1494 .n(8) 1495 .k(k) 1496 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1497 } 1498 } 1499 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4_strided_a)1500 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 1501 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1502 for (size_t k = 1; k < 4; k++) { 1503 GemmMicrokernelTester() 1504 .mr(8) 1505 .nr(8) 1506 .kr(1) 1507 .sr(1) 1508 .m(8) 1509 .n(8) 1510 .k(k) 1511 .a_stride(7) 1512 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1513 } 1514 } 1515 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4_subtile)1516 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 1517 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1518 for (size_t k = 1; k < 4; k++) { 1519 for (uint32_t m = 1; m <= 8; m++) { 1520 for (uint32_t n = 1; n <= 8; n++) { 1521 GemmMicrokernelTester() 1522 .mr(8) 1523 .nr(8) 1524 .kr(1) 1525 .sr(1) 1526 .m(m) 1527 .n(n) 1528 .k(k) 1529 .iterations(1) 1530 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1531 } 1532 } 1533 } 1534 } 1535 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4)1536 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4) { 1537 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1538 for (size_t k = 5; k < 8; k++) { 1539 GemmMicrokernelTester() 1540 .mr(8) 1541 .nr(8) 1542 .kr(1) 1543 .sr(1) 1544 .m(8) 1545 .n(8) 1546 .k(k) 1547 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1548 } 1549 } 1550 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4_strided_a)1551 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 1552 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1553 for (size_t k = 5; k < 8; k++) { 1554 GemmMicrokernelTester() 1555 .mr(8) 1556 .nr(8) 1557 .kr(1) 1558 .sr(1) 1559 .m(8) 1560 .n(8) 1561 .k(k) 1562 .a_stride(11) 1563 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1564 } 1565 } 1566 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4_subtile)1567 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 1568 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1569 for (size_t k = 5; k < 8; k++) { 1570 for (uint32_t m = 1; m <= 8; m++) { 1571 for (uint32_t n = 1; n <= 8; n++) { 1572 GemmMicrokernelTester() 1573 .mr(8) 1574 .nr(8) 1575 .kr(1) 1576 .sr(1) 1577 .m(m) 1578 .n(n) 1579 .k(k) 1580 .iterations(1) 1581 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1582 } 1583 } 1584 } 1585 } 1586 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4)1587 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4) { 1588 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1589 for (size_t k = 8; k <= 40; k += 4) { 1590 GemmMicrokernelTester() 1591 .mr(8) 1592 .nr(8) 1593 .kr(1) 1594 .sr(1) 1595 .m(8) 1596 .n(8) 1597 .k(k) 1598 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1599 } 1600 } 1601 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4_strided_a)1602 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_strided_a) { 1603 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1604 for (size_t k = 8; k <= 40; k += 4) { 1605 GemmMicrokernelTester() 1606 .mr(8) 1607 .nr(8) 1608 .kr(1) 1609 .sr(1) 1610 .m(8) 1611 .n(8) 1612 .k(k) 1613 .a_stride(43) 1614 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1615 } 1616 } 1617 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4_subtile)1618 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 1619 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1620 for (size_t k = 8; k <= 40; k += 4) { 1621 for (uint32_t m = 1; m <= 8; m++) { 1622 for (uint32_t n = 1; n <= 8; n++) { 1623 GemmMicrokernelTester() 1624 .mr(8) 1625 .nr(8) 1626 .kr(1) 1627 .sr(1) 
1628 .m(m) 1629 .n(n) 1630 .k(k) 1631 .iterations(1) 1632 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1633 } 1634 } 1635 } 1636 } 1637 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8)1638 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8) { 1639 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1640 for (uint32_t n = 9; n < 16; n++) { 1641 for (size_t k = 1; k <= 20; k += 5) { 1642 GemmMicrokernelTester() 1643 .mr(8) 1644 .nr(8) 1645 .kr(1) 1646 .sr(1) 1647 .m(8) 1648 .n(8) 1649 .k(k) 1650 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1651 } 1652 } 1653 } 1654 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)1655 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 1656 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1657 for (uint32_t n = 9; n < 16; n++) { 1658 for (size_t k = 1; k <= 20; k += 5) { 1659 GemmMicrokernelTester() 1660 .mr(8) 1661 .nr(8) 1662 .kr(1) 1663 .sr(1) 1664 .m(8) 1665 .n(8) 1666 .k(k) 1667 .cn_stride(11) 1668 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1669 } 1670 } 1671 } 1672 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_strided_a)1673 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) { 1674 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1675 for (uint32_t n = 9; n < 16; n++) { 1676 for (size_t k = 1; k <= 20; k += 5) { 1677 GemmMicrokernelTester() 1678 .mr(8) 1679 .nr(8) 1680 .kr(1) 1681 .sr(1) 1682 .m(8) 1683 .n(n) 1684 .k(k) 1685 .a_stride(23) 1686 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1687 } 1688 } 1689 } 1690 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_subtile)1691 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 1692 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1693 for (uint32_t n = 9; n < 16; n++) { 1694 for (size_t k = 1; k <= 20; k += 5) { 1695 for (uint32_t m = 1; m <= 8; m++) { 1696 GemmMicrokernelTester() 1697 .mr(8) 1698 .nr(8) 1699 .kr(1) 1700 .sr(1) 1701 .m(m) 1702 .n(n) 1703 .k(k) 1704 .iterations(1) 1705 
.Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1706 } 1707 } 1708 } 1709 } 1710 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8)1711 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8) { 1712 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1713 for (uint32_t n = 16; n <= 24; n += 8) { 1714 for (size_t k = 1; k <= 20; k += 5) { 1715 GemmMicrokernelTester() 1716 .mr(8) 1717 .nr(8) 1718 .kr(1) 1719 .sr(1) 1720 .m(8) 1721 .n(8) 1722 .k(k) 1723 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1724 } 1725 } 1726 } 1727 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)1728 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 1729 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1730 for (uint32_t n = 16; n <= 24; n += 8) { 1731 for (size_t k = 1; k <= 20; k += 5) { 1732 GemmMicrokernelTester() 1733 .mr(8) 1734 .nr(8) 1735 .kr(1) 1736 .sr(1) 1737 .m(8) 1738 .n(n) 1739 .k(k) 1740 .cn_stride(11) 1741 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1742 } 1743 } 1744 } 1745 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_strided_a)1746 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_a) { 1747 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1748 for (uint32_t n = 16; n <= 24; n += 8) { 1749 for (size_t k = 1; k <= 20; k += 5) { 1750 GemmMicrokernelTester() 1751 .mr(8) 1752 .nr(8) 1753 .kr(1) 1754 .sr(1) 1755 .m(8) 1756 .n(n) 1757 .k(k) 1758 .a_stride(23) 1759 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1760 } 1761 } 1762 } 1763 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_subtile)1764 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 1765 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1766 for (uint32_t n = 16; n <= 24; n += 8) { 1767 for (size_t k = 1; k <= 20; k += 5) { 1768 for (uint32_t m = 1; m <= 8; m++) { 1769 GemmMicrokernelTester() 1770 .mr(8) 1771 .nr(8) 1772 .kr(1) 1773 .sr(1) 1774 .m(m) 1775 .n(n) 1776 .k(k) 1777 .iterations(1) 1778 
.Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1779 } 1780 } 1781 } 1782 } 1783 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cm_subtile)1784 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 1785 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1786 for (size_t k = 1; k <= 20; k += 5) { 1787 for (uint32_t m = 1; m <= 8; m++) { 1788 for (uint32_t n = 1; n <= 8; n++) { 1789 GemmMicrokernelTester() 1790 .mr(8) 1791 .nr(8) 1792 .kr(1) 1793 .sr(1) 1794 .m(m) 1795 .n(n) 1796 .k(k) 1797 .cm_stride(11) 1798 .iterations(1) 1799 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1800 } 1801 } 1802 } 1803 } 1804 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,qmin)1805 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmin) { 1806 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1807 GemmMicrokernelTester() 1808 .mr(8) 1809 .nr(8) 1810 .kr(1) 1811 .sr(1) 1812 .m(8) 1813 .n(8) 1814 .k(4) 1815 .qmin(128) 1816 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1817 } 1818 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,qmax)1819 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmax) { 1820 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1821 GemmMicrokernelTester() 1822 .mr(8) 1823 .nr(8) 1824 .kr(1) 1825 .sr(1) 1826 .m(8) 1827 .n(8) 1828 .k(4) 1829 .qmax(128) 1830 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1831 } 1832 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cm)1833 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm) { 1834 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1835 GemmMicrokernelTester() 1836 .mr(8) 1837 .nr(8) 1838 .kr(1) 1839 .sr(1) 1840 .m(8) 1841 .n(8) 1842 .k(4) 1843 .cm_stride(11) 1844 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64); 1845 } 1846 #endif // XNN_ARCH_ARM64 1847 1848 1849 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4)1850 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4) { 1851 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1852 GemmMicrokernelTester() 1853 .mr(1) 1854 .nr(16) 1855 .kr(1) 1856 
.sr(1) 1857 .m(1) 1858 .n(16) 1859 .k(4) 1860 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1861 } 1862 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cn)1863 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cn) { 1864 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1865 GemmMicrokernelTester() 1866 .mr(1) 1867 .nr(16) 1868 .kr(1) 1869 .sr(1) 1870 .m(1) 1871 .n(16) 1872 .k(4) 1873 .cn_stride(19) 1874 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1875 } 1876 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_strided_a)1877 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 1878 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1879 GemmMicrokernelTester() 1880 .mr(1) 1881 .nr(16) 1882 .kr(1) 1883 .sr(1) 1884 .m(1) 1885 .n(16) 1886 .k(4) 1887 .a_stride(7) 1888 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1889 } 1890 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_subtile)1891 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 1892 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1893 for (uint32_t m = 1; m <= 1; m++) { 1894 for (uint32_t n = 1; n <= 16; n++) { 1895 GemmMicrokernelTester() 1896 .mr(1) 1897 .nr(16) 1898 .kr(1) 1899 .sr(1) 1900 .m(m) 1901 .n(n) 1902 .k(4) 1903 .iterations(1) 1904 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1905 } 1906 } 1907 } 1908 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)1909 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 1910 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1911 for (uint32_t m = 1; m <= 1; m++) { 1912 GemmMicrokernelTester() 1913 .mr(1) 1914 .nr(16) 1915 .kr(1) 1916 .sr(1) 1917 .m(m) 1918 .n(16) 1919 .k(4) 1920 .iterations(1) 1921 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1922 } 1923 } 1924 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)1925 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 1926 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1927 for (uint32_t n = 1; n <= 16; n++) 
{ 1928 GemmMicrokernelTester() 1929 .mr(1) 1930 .nr(16) 1931 .kr(1) 1932 .sr(1) 1933 .m(1) 1934 .n(n) 1935 .k(4) 1936 .iterations(1) 1937 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1938 } 1939 } 1940 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_lt_4)1941 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4) { 1942 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1943 for (size_t k = 1; k < 4; k++) { 1944 GemmMicrokernelTester() 1945 .mr(1) 1946 .nr(16) 1947 .kr(1) 1948 .sr(1) 1949 .m(1) 1950 .n(16) 1951 .k(k) 1952 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1953 } 1954 } 1955 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_lt_4_strided_a)1956 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 1957 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1958 for (size_t k = 1; k < 4; k++) { 1959 GemmMicrokernelTester() 1960 .mr(1) 1961 .nr(16) 1962 .kr(1) 1963 .sr(1) 1964 .m(1) 1965 .n(16) 1966 .k(k) 1967 .a_stride(7) 1968 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1969 } 1970 } 1971 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_lt_4_subtile)1972 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 1973 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1974 for (size_t k = 1; k < 4; k++) { 1975 for (uint32_t m = 1; m <= 1; m++) { 1976 for (uint32_t n = 1; n <= 16; n++) { 1977 GemmMicrokernelTester() 1978 .mr(1) 1979 .nr(16) 1980 .kr(1) 1981 .sr(1) 1982 .m(m) 1983 .n(n) 1984 .k(k) 1985 .iterations(1) 1986 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 1987 } 1988 } 1989 } 1990 } 1991 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_gt_4)1992 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4) { 1993 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1994 for (size_t k = 5; k < 8; k++) { 1995 GemmMicrokernelTester() 1996 .mr(1) 1997 .nr(16) 1998 .kr(1) 1999 .sr(1) 2000 .m(1) 2001 .n(16) 2002 .k(k) 2003 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2004 } 2005 } 2006 
TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_gt_4_strided_a)2007 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 2008 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2009 for (size_t k = 5; k < 8; k++) { 2010 GemmMicrokernelTester() 2011 .mr(1) 2012 .nr(16) 2013 .kr(1) 2014 .sr(1) 2015 .m(1) 2016 .n(16) 2017 .k(k) 2018 .a_stride(11) 2019 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2020 } 2021 } 2022 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_gt_4_subtile)2023 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 2024 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2025 for (size_t k = 5; k < 8; k++) { 2026 for (uint32_t m = 1; m <= 1; m++) { 2027 for (uint32_t n = 1; n <= 16; n++) { 2028 GemmMicrokernelTester() 2029 .mr(1) 2030 .nr(16) 2031 .kr(1) 2032 .sr(1) 2033 .m(m) 2034 .n(n) 2035 .k(k) 2036 .iterations(1) 2037 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2038 } 2039 } 2040 } 2041 } 2042 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_div_4)2043 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4) { 2044 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2045 for (size_t k = 8; k <= 40; k += 4) { 2046 GemmMicrokernelTester() 2047 .mr(1) 2048 .nr(16) 2049 .kr(1) 2050 .sr(1) 2051 .m(1) 2052 .n(16) 2053 .k(k) 2054 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2055 } 2056 } 2057 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_div_4_strided_a)2058 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_strided_a) { 2059 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2060 for (size_t k = 8; k <= 40; k += 4) { 2061 GemmMicrokernelTester() 2062 .mr(1) 2063 .nr(16) 2064 .kr(1) 2065 .sr(1) 2066 .m(1) 2067 .n(16) 2068 .k(k) 2069 .a_stride(43) 2070 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2071 } 2072 } 2073 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_div_4_subtile)2074 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 2075 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2076 for (size_t k = 8; k <= 40; k += 4) { 
2077 for (uint32_t m = 1; m <= 1; m++) { 2078 for (uint32_t n = 1; n <= 16; n++) { 2079 GemmMicrokernelTester() 2080 .mr(1) 2081 .nr(16) 2082 .kr(1) 2083 .sr(1) 2084 .m(m) 2085 .n(n) 2086 .k(k) 2087 .iterations(1) 2088 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2089 } 2090 } 2091 } 2092 } 2093 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16)2094 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16) { 2095 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2096 for (uint32_t n = 17; n < 32; n++) { 2097 for (size_t k = 1; k <= 20; k += 5) { 2098 GemmMicrokernelTester() 2099 .mr(1) 2100 .nr(16) 2101 .kr(1) 2102 .sr(1) 2103 .m(1) 2104 .n(16) 2105 .k(k) 2106 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2107 } 2108 } 2109 } 2110 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)2111 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 2112 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2113 for (uint32_t n = 17; n < 32; n++) { 2114 for (size_t k = 1; k <= 20; k += 5) { 2115 GemmMicrokernelTester() 2116 .mr(1) 2117 .nr(16) 2118 .kr(1) 2119 .sr(1) 2120 .m(1) 2121 .n(16) 2122 .k(k) 2123 .cn_stride(19) 2124 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2125 } 2126 } 2127 } 2128 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_strided_a)2129 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) { 2130 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2131 for (uint32_t n = 17; n < 32; n++) { 2132 for (size_t k = 1; k <= 20; k += 5) { 2133 GemmMicrokernelTester() 2134 .mr(1) 2135 .nr(16) 2136 .kr(1) 2137 .sr(1) 2138 .m(1) 2139 .n(n) 2140 .k(k) 2141 .a_stride(23) 2142 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2143 } 2144 } 2145 } 2146 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_subtile)2147 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 2148 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2149 for (uint32_t n = 17; n < 32; n++) { 2150 for (size_t k = 1; k <= 20; k += 5) { 2151 for 
(uint32_t m = 1; m <= 1; m++) { 2152 GemmMicrokernelTester() 2153 .mr(1) 2154 .nr(16) 2155 .kr(1) 2156 .sr(1) 2157 .m(m) 2158 .n(n) 2159 .k(k) 2160 .iterations(1) 2161 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2162 } 2163 } 2164 } 2165 } 2166 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16)2167 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16) { 2168 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2169 for (uint32_t n = 32; n <= 48; n += 16) { 2170 for (size_t k = 1; k <= 20; k += 5) { 2171 GemmMicrokernelTester() 2172 .mr(1) 2173 .nr(16) 2174 .kr(1) 2175 .sr(1) 2176 .m(1) 2177 .n(16) 2178 .k(k) 2179 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2180 } 2181 } 2182 } 2183 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)2184 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 2185 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2186 for (uint32_t n = 32; n <= 48; n += 16) { 2187 for (size_t k = 1; k <= 20; k += 5) { 2188 GemmMicrokernelTester() 2189 .mr(1) 2190 .nr(16) 2191 .kr(1) 2192 .sr(1) 2193 .m(1) 2194 .n(n) 2195 .k(k) 2196 .cn_stride(19) 2197 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2198 } 2199 } 2200 } 2201 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_strided_a)2202 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_a) { 2203 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2204 for (uint32_t n = 32; n <= 48; n += 16) { 2205 for (size_t k = 1; k <= 20; k += 5) { 2206 GemmMicrokernelTester() 2207 .mr(1) 2208 .nr(16) 2209 .kr(1) 2210 .sr(1) 2211 .m(1) 2212 .n(n) 2213 .k(k) 2214 .a_stride(23) 2215 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2216 } 2217 } 2218 } 2219 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_subtile)2220 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 2221 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2222 for (uint32_t n = 32; n <= 48; n += 16) { 2223 for (size_t k = 1; k <= 20; k += 5) { 2224 for (uint32_t m = 1; m <= 1; 
m++) { 2225 GemmMicrokernelTester() 2226 .mr(1) 2227 .nr(16) 2228 .kr(1) 2229 .sr(1) 2230 .m(m) 2231 .n(n) 2232 .k(k) 2233 .iterations(1) 2234 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2235 } 2236 } 2237 } 2238 } 2239 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cm_subtile)2240 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 2241 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2242 for (size_t k = 1; k <= 20; k += 5) { 2243 for (uint32_t m = 1; m <= 1; m++) { 2244 for (uint32_t n = 1; n <= 16; n++) { 2245 GemmMicrokernelTester() 2246 .mr(1) 2247 .nr(16) 2248 .kr(1) 2249 .sr(1) 2250 .m(m) 2251 .n(n) 2252 .k(k) 2253 .cm_stride(19) 2254 .iterations(1) 2255 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2256 } 2257 } 2258 } 2259 } 2260 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,qmin)2261 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmin) { 2262 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2263 GemmMicrokernelTester() 2264 .mr(1) 2265 .nr(16) 2266 .kr(1) 2267 .sr(1) 2268 .m(1) 2269 .n(16) 2270 .k(4) 2271 .qmin(128) 2272 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2273 } 2274 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,qmax)2275 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmax) { 2276 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2277 GemmMicrokernelTester() 2278 .mr(1) 2279 .nr(16) 2280 .kr(1) 2281 .sr(1) 2282 .m(1) 2283 .n(16) 2284 .k(4) 2285 .qmax(128) 2286 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2287 } 2288 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cm)2289 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm) { 2290 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2291 GemmMicrokernelTester() 2292 .mr(1) 2293 .nr(16) 2294 .kr(1) 2295 .sr(1) 2296 .m(1) 2297 .n(16) 2298 .k(4) 2299 .cm_stride(19) 2300 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64); 2301 } 2302 #endif // XNN_ARCH_ARM64 2303 2304 2305 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4)2306 
// m == 4, n == 16, k == 4.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(4)
      .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
}

// m == 4, n == 16, k == 4 with cn_stride == 19.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(4)
      .cn_stride(19)
      .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
}

// m == 4, n == 16, k == 4 with a_stride == 7.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(4)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
}

// k == 4, every m in [1, 4] and n in [1, 16]; one iteration each.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t mm = 1; mm <= 4; ++mm) {
    for (uint32_t nn = 1; nn <= 16; ++nn) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mm).n(nn).k(4)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}

// k == 4, n == 16, every m in [1, 4]; one iteration each.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t mm = 1; mm <= 4; ++mm) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(mm).n(16).k(4)
        .iterations(1)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}
// k == 4, m == 4, every n in [1, 16]; one iteration each.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t nn = 1; nn <= 16; ++nn) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(nn).k(4)
        .iterations(1)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}

// k in [1, 3], m == 4, n == 16.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 1; kk < 4; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(kk)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}

// k in [1, 3], m == 4, n == 16, a_stride == 7.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 1; kk < 4; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(kk)
        .a_stride(7)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}

// k in [1, 3], every m in [1, 4] and n in [1, 16]; one iteration each.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 1; kk < 4; ++kk) {
    for (uint32_t mm = 1; mm <= 4; ++mm) {
      for (uint32_t nn = 1; nn <= 16; ++nn) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
      }
    }
  }
}

// k in [5, 7], m == 4, n == 16.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 5; kk < 8; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(kk)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}

// k in [5, 7], m == 4, n == 16, a_stride == 11.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 5; kk < 8; ++kk) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(kk)
        .a_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}

// k in [5, 7], every m in [1, 4] and n in [1, 16]; one iteration each.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 5; kk < 8; ++kk) {
    for (uint32_t mm = 1; mm <= 4; ++mm) {
      for (uint32_t nn = 1; nn <= 16; ++nn) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(mm).n(nn).k(kk)
            .iterations(1)
            .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
      }
    }
  }
}

// k in {8, 12, ..., 40}, m == 4, n == 16.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 8; kk <= 40; kk += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(kk)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}

// k in {8, 12, ..., 40}, m == 4, n == 16, a_stride == 43.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t kk = 8; kk <= 40; kk += 4) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(kk)
        .a_stride(43)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
  }
}
// NOTE(review): this file is generated by tools/generate-gemm-test.py; the
// `.n(n)` corrections below should be mirrored in the generator template.

// k in {8, 12, ..., 40}, every m in [1, 4] and n in [1, 16]; one iteration each.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t m = 1; m <= 4; m++) {
      for (uint32_t n = 1; n <= 16; n++) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
      }
    }
  }
}

// n in [17, 31], k in {1, 6, 11, 16}, m == 4.
// The loop's n is forwarded to the tester: with a fixed .n(16) the n loop
// variable was unused and every iteration repeated the same case.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}

// Same sweep as n_gt_16 with cn_stride == 19; n follows the loop variable,
// as the sibling n_gt_16_strided_a test below already does.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}

// n in [17, 31], k in {1, 6, 11, 16}, a_stride == 23.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .a_stride(23)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}
// 4x16 NEON-FP16 LD64 GEMM tests: n_gt_16_subtile and the n_div_16 cases.
//
// Review fix: n_div_16 loops n over {32, 48} but handed the tester the
// constant .n(16), leaving the loop variable unused. It now passes .n(n), as
// n_div_16_strided_cn and n_div_16_strided_a already do. This file is
// generated (see the file header), so the generator template needs the same
// correction.

// n = 17..31, k in {1, 6, 11, 16}, every m in 1..4; iterations(1) per case.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
      }
    }
  }
}

// n in {32, 48} with k in {1, 6, 11, 16}; m fixed at 4.
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)  // was .n(16): loop variable n never reached the tester
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}

// n in {32, 48} with k in {1, 6, 11, 16}, m fixed at 4, cn_stride(19).
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}

// n in {32, 48} with k in {1, 6, 11, 16}, m fixed at 4, a_stride(23).
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
    }
  }
}
TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_subtile)2676 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 2677 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2678 for (uint32_t n = 32; n <= 48; n += 16) { 2679 for (size_t k = 1; k <= 20; k += 5) { 2680 for (uint32_t m = 1; m <= 4; m++) { 2681 GemmMicrokernelTester() 2682 .mr(4) 2683 .nr(16) 2684 .kr(1) 2685 .sr(1) 2686 .m(m) 2687 .n(n) 2688 .k(k) 2689 .iterations(1) 2690 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64); 2691 } 2692 } 2693 } 2694 } 2695 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cm_subtile)2696 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 2697 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2698 for (size_t k = 1; k <= 20; k += 5) { 2699 for (uint32_t m = 1; m <= 4; m++) { 2700 for (uint32_t n = 1; n <= 16; n++) { 2701 GemmMicrokernelTester() 2702 .mr(4) 2703 .nr(16) 2704 .kr(1) 2705 .sr(1) 2706 .m(m) 2707 .n(n) 2708 .k(k) 2709 .cm_stride(19) 2710 .iterations(1) 2711 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64); 2712 } 2713 } 2714 } 2715 } 2716 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,qmin)2717 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmin) { 2718 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2719 GemmMicrokernelTester() 2720 .mr(4) 2721 .nr(16) 2722 .kr(1) 2723 .sr(1) 2724 .m(4) 2725 .n(16) 2726 .k(4) 2727 .qmin(128) 2728 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64); 2729 } 2730 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,qmax)2731 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmax) { 2732 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2733 GemmMicrokernelTester() 2734 .mr(4) 2735 .nr(16) 2736 .kr(1) 2737 .sr(1) 2738 .m(4) 2739 .n(16) 2740 .k(4) 2741 .qmax(128) 2742 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64); 2743 } 2744 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cm)2745 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm) { 2746 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2747 GemmMicrokernelTester() 
2748 .mr(4) 2749 .nr(16) 2750 .kr(1) 2751 .sr(1) 2752 .m(4) 2753 .n(16) 2754 .k(4) 2755 .cm_stride(19) 2756 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64); 2757 } 2758 #endif // XNN_ARCH_ARM64 2759 2760 2761 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4)2762 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4) { 2763 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2764 GemmMicrokernelTester() 2765 .mr(6) 2766 .nr(16) 2767 .kr(1) 2768 .sr(1) 2769 .m(6) 2770 .n(16) 2771 .k(4) 2772 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2773 } 2774 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cn)2775 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cn) { 2776 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2777 GemmMicrokernelTester() 2778 .mr(6) 2779 .nr(16) 2780 .kr(1) 2781 .sr(1) 2782 .m(6) 2783 .n(16) 2784 .k(4) 2785 .cn_stride(19) 2786 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2787 } 2788 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_strided_a)2789 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 2790 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2791 GemmMicrokernelTester() 2792 .mr(6) 2793 .nr(16) 2794 .kr(1) 2795 .sr(1) 2796 .m(6) 2797 .n(16) 2798 .k(4) 2799 .a_stride(7) 2800 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2801 } 2802 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile)2803 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 2804 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2805 for (uint32_t m = 1; m <= 6; m++) { 2806 for (uint32_t n = 1; n <= 16; n++) { 2807 GemmMicrokernelTester() 2808 .mr(6) 2809 .nr(16) 2810 .kr(1) 2811 .sr(1) 2812 .m(m) 2813 .n(n) 2814 .k(4) 2815 .iterations(1) 2816 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2817 } 2818 } 2819 } 2820 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)2821 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 2822 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2823 
for (uint32_t m = 1; m <= 6; m++) { 2824 GemmMicrokernelTester() 2825 .mr(6) 2826 .nr(16) 2827 .kr(1) 2828 .sr(1) 2829 .m(m) 2830 .n(16) 2831 .k(4) 2832 .iterations(1) 2833 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2834 } 2835 } 2836 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)2837 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 2838 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2839 for (uint32_t n = 1; n <= 16; n++) { 2840 GemmMicrokernelTester() 2841 .mr(6) 2842 .nr(16) 2843 .kr(1) 2844 .sr(1) 2845 .m(6) 2846 .n(n) 2847 .k(4) 2848 .iterations(1) 2849 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2850 } 2851 } 2852 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4)2853 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4) { 2854 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2855 for (size_t k = 1; k < 4; k++) { 2856 GemmMicrokernelTester() 2857 .mr(6) 2858 .nr(16) 2859 .kr(1) 2860 .sr(1) 2861 .m(6) 2862 .n(16) 2863 .k(k) 2864 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2865 } 2866 } 2867 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4_strided_a)2868 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) { 2869 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2870 for (size_t k = 1; k < 4; k++) { 2871 GemmMicrokernelTester() 2872 .mr(6) 2873 .nr(16) 2874 .kr(1) 2875 .sr(1) 2876 .m(6) 2877 .n(16) 2878 .k(k) 2879 .a_stride(7) 2880 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2881 } 2882 } 2883 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4_subtile)2884 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 2885 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2886 for (size_t k = 1; k < 4; k++) { 2887 for (uint32_t m = 1; m <= 6; m++) { 2888 for (uint32_t n = 1; n <= 16; n++) { 2889 GemmMicrokernelTester() 2890 .mr(6) 2891 .nr(16) 2892 .kr(1) 2893 .sr(1) 2894 .m(m) 2895 .n(n) 2896 .k(k) 2897 .iterations(1) 2898 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2899 } 
2900 } 2901 } 2902 } 2903 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4)2904 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4) { 2905 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2906 for (size_t k = 5; k < 8; k++) { 2907 GemmMicrokernelTester() 2908 .mr(6) 2909 .nr(16) 2910 .kr(1) 2911 .sr(1) 2912 .m(6) 2913 .n(16) 2914 .k(k) 2915 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2916 } 2917 } 2918 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4_strided_a)2919 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) { 2920 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2921 for (size_t k = 5; k < 8; k++) { 2922 GemmMicrokernelTester() 2923 .mr(6) 2924 .nr(16) 2925 .kr(1) 2926 .sr(1) 2927 .m(6) 2928 .n(16) 2929 .k(k) 2930 .a_stride(11) 2931 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2932 } 2933 } 2934 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4_subtile)2935 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 2936 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2937 for (size_t k = 5; k < 8; k++) { 2938 for (uint32_t m = 1; m <= 6; m++) { 2939 for (uint32_t n = 1; n <= 16; n++) { 2940 GemmMicrokernelTester() 2941 .mr(6) 2942 .nr(16) 2943 .kr(1) 2944 .sr(1) 2945 .m(m) 2946 .n(n) 2947 .k(k) 2948 .iterations(1) 2949 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2950 } 2951 } 2952 } 2953 } 2954 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4)2955 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4) { 2956 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2957 for (size_t k = 8; k <= 40; k += 4) { 2958 GemmMicrokernelTester() 2959 .mr(6) 2960 .nr(16) 2961 .kr(1) 2962 .sr(1) 2963 .m(6) 2964 .n(16) 2965 .k(k) 2966 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2967 } 2968 } 2969 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4_strided_a)2970 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_strided_a) { 2971 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2972 for (size_t k = 8; k <= 40; k += 4) { 2973 
GemmMicrokernelTester() 2974 .mr(6) 2975 .nr(16) 2976 .kr(1) 2977 .sr(1) 2978 .m(6) 2979 .n(16) 2980 .k(k) 2981 .a_stride(43) 2982 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 2983 } 2984 } 2985 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4_subtile)2986 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 2987 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2988 for (size_t k = 8; k <= 40; k += 4) { 2989 for (uint32_t m = 1; m <= 6; m++) { 2990 for (uint32_t n = 1; n <= 16; n++) { 2991 GemmMicrokernelTester() 2992 .mr(6) 2993 .nr(16) 2994 .kr(1) 2995 .sr(1) 2996 .m(m) 2997 .n(n) 2998 .k(k) 2999 .iterations(1) 3000 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3001 } 3002 } 3003 } 3004 } 3005 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16)3006 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16) { 3007 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3008 for (uint32_t n = 17; n < 32; n++) { 3009 for (size_t k = 1; k <= 20; k += 5) { 3010 GemmMicrokernelTester() 3011 .mr(6) 3012 .nr(16) 3013 .kr(1) 3014 .sr(1) 3015 .m(6) 3016 .n(16) 3017 .k(k) 3018 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3019 } 3020 } 3021 } 3022 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)3023 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 3024 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3025 for (uint32_t n = 17; n < 32; n++) { 3026 for (size_t k = 1; k <= 20; k += 5) { 3027 GemmMicrokernelTester() 3028 .mr(6) 3029 .nr(16) 3030 .kr(1) 3031 .sr(1) 3032 .m(6) 3033 .n(16) 3034 .k(k) 3035 .cn_stride(19) 3036 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3037 } 3038 } 3039 } 3040 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_strided_a)3041 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) { 3042 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3043 for (uint32_t n = 17; n < 32; n++) { 3044 for (size_t k = 1; k <= 20; k += 5) { 3045 GemmMicrokernelTester() 3046 .mr(6) 3047 .nr(16) 3048 
.kr(1) 3049 .sr(1) 3050 .m(6) 3051 .n(n) 3052 .k(k) 3053 .a_stride(23) 3054 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3055 } 3056 } 3057 } 3058 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_subtile)3059 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 3060 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3061 for (uint32_t n = 17; n < 32; n++) { 3062 for (size_t k = 1; k <= 20; k += 5) { 3063 for (uint32_t m = 1; m <= 6; m++) { 3064 GemmMicrokernelTester() 3065 .mr(6) 3066 .nr(16) 3067 .kr(1) 3068 .sr(1) 3069 .m(m) 3070 .n(n) 3071 .k(k) 3072 .iterations(1) 3073 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3074 } 3075 } 3076 } 3077 } 3078 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16)3079 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16) { 3080 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3081 for (uint32_t n = 32; n <= 48; n += 16) { 3082 for (size_t k = 1; k <= 20; k += 5) { 3083 GemmMicrokernelTester() 3084 .mr(6) 3085 .nr(16) 3086 .kr(1) 3087 .sr(1) 3088 .m(6) 3089 .n(16) 3090 .k(k) 3091 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3092 } 3093 } 3094 } 3095 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)3096 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 3097 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3098 for (uint32_t n = 32; n <= 48; n += 16) { 3099 for (size_t k = 1; k <= 20; k += 5) { 3100 GemmMicrokernelTester() 3101 .mr(6) 3102 .nr(16) 3103 .kr(1) 3104 .sr(1) 3105 .m(6) 3106 .n(n) 3107 .k(k) 3108 .cn_stride(19) 3109 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3110 } 3111 } 3112 } 3113 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_strided_a)3114 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_a) { 3115 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3116 for (uint32_t n = 32; n <= 48; n += 16) { 3117 for (size_t k = 1; k <= 20; k += 5) { 3118 GemmMicrokernelTester() 3119 .mr(6) 3120 .nr(16) 3121 .kr(1) 3122 .sr(1) 3123 .m(6) 
3124 .n(n) 3125 .k(k) 3126 .a_stride(23) 3127 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3128 } 3129 } 3130 } 3131 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_subtile)3132 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 3133 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3134 for (uint32_t n = 32; n <= 48; n += 16) { 3135 for (size_t k = 1; k <= 20; k += 5) { 3136 for (uint32_t m = 1; m <= 6; m++) { 3137 GemmMicrokernelTester() 3138 .mr(6) 3139 .nr(16) 3140 .kr(1) 3141 .sr(1) 3142 .m(m) 3143 .n(n) 3144 .k(k) 3145 .iterations(1) 3146 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3147 } 3148 } 3149 } 3150 } 3151 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cm_subtile)3152 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 3153 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3154 for (size_t k = 1; k <= 20; k += 5) { 3155 for (uint32_t m = 1; m <= 6; m++) { 3156 for (uint32_t n = 1; n <= 16; n++) { 3157 GemmMicrokernelTester() 3158 .mr(6) 3159 .nr(16) 3160 .kr(1) 3161 .sr(1) 3162 .m(m) 3163 .n(n) 3164 .k(k) 3165 .cm_stride(19) 3166 .iterations(1) 3167 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3168 } 3169 } 3170 } 3171 } 3172 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,qmin)3173 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmin) { 3174 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3175 GemmMicrokernelTester() 3176 .mr(6) 3177 .nr(16) 3178 .kr(1) 3179 .sr(1) 3180 .m(6) 3181 .n(16) 3182 .k(4) 3183 .qmin(128) 3184 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3185 } 3186 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,qmax)3187 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmax) { 3188 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3189 GemmMicrokernelTester() 3190 .mr(6) 3191 .nr(16) 3192 .kr(1) 3193 .sr(1) 3194 .m(6) 3195 .n(16) 3196 .k(4) 3197 .qmax(128) 3198 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3199 } 3200 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cm)3201 
TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm) { 3202 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3203 GemmMicrokernelTester() 3204 .mr(6) 3205 .nr(16) 3206 .kr(1) 3207 .sr(1) 3208 .m(6) 3209 .n(16) 3210 .k(4) 3211 .cm_stride(19) 3212 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64); 3213 } 3214 #endif // XNN_ARCH_ARM64 3215 3216 3217 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4)3218 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4) { 3219 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3220 GemmMicrokernelTester() 3221 .mr(8) 3222 .nr(16) 3223 .kr(1) 3224 .sr(1) 3225 .m(8) 3226 .n(16) 3227 .k(4) 3228 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3229 } 3230 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cn)3231 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cn) { 3232 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3233 GemmMicrokernelTester() 3234 .mr(8) 3235 .nr(16) 3236 .kr(1) 3237 .sr(1) 3238 .m(8) 3239 .n(16) 3240 .k(4) 3241 .cn_stride(19) 3242 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3243 } 3244 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_strided_a)3245 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) { 3246 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3247 GemmMicrokernelTester() 3248 .mr(8) 3249 .nr(16) 3250 .kr(1) 3251 .sr(1) 3252 .m(8) 3253 .n(16) 3254 .k(4) 3255 .a_stride(7) 3256 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3257 } 3258 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile)3259 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 3260 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3261 for (uint32_t m = 1; m <= 8; m++) { 3262 for (uint32_t n = 1; n <= 16; n++) { 3263 GemmMicrokernelTester() 3264 .mr(8) 3265 .nr(16) 3266 .kr(1) 3267 .sr(1) 3268 .m(m) 3269 .n(n) 3270 .k(4) 3271 .iterations(1) 3272 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3273 } 3274 } 3275 } 3276 
// 8x16 LD64 suite: k_eq_4 subtile variants and the k < 4, k > 4 and
// k-divisible-by-4 sweeps.

// k = 4, n = 16, every m in 1..8; iterations(1) per case.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m = 1; m <= 8; ++m) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(m).n(16).k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 4, m = 8, every n in 1..16; iterations(1) per case.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 16; ++n) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(n).k(4)
      .iterations(1)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 1..3 on the full 8x16 tile.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 1..3 on the full 8x16 tile with a_stride(7).
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; ++k) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 1..3 crossed with every m in 1..8 and n in 1..16; iterations(1) per case.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; ++k) {
    for (uint32_t m = 1; m <= 8; ++m) {
      for (uint32_t n = 1; n <= 16; ++n) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
      }
    }
  }
}

// k = 5..7 on the full 8x16 tile.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 5..7 on the full 8x16 tile with a_stride(11).
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 5..7 crossed with every m in 1..8 and n in 1..16; iterations(1) per case.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; ++k) {
    for (uint32_t m = 1; m <= 8; ++m) {
      for (uint32_t n = 1; n <= 16; ++n) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
      }
    }
  }
}

// k = 8, 12, ..., 40 on the full 8x16 tile.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}
// 8x16 LD64 suite: k_div_4 strided/subtile variants and the first n > 16 cases.
//
// Review fix: n_gt_16 and n_gt_16_strided_cn loop n over 17..31 but handed the
// tester the constant .n(16), so the loop variable was unused and every
// iteration repeated the n == 16 case. They now pass .n(n). This file is
// generated (see the file header), so the generator template needs the same
// correction.

// k = 8, 12, ..., 40 on the full 8x16 tile with a_stride(43).
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
  }
}

// k = 8, 12, ..., 40 crossed with every m in 1..8 and n in 1..16;
// iterations(1) per case.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t m = 1; m <= 8; ++m) {
      for (uint32_t n = 1; n <= 16; ++n) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
      }
    }
  }
}

// n = 17..31 with k in {1, 6, 11, 16}; m fixed at 8.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)  // was .n(16): loop variable n never reached the tester
        .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
    }
  }
}

// n = 17..31 with k in {1, 6, 11, 16}, m fixed at 8, cn_stride(19).
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)  // was .n(16): loop variable n never reached the tester
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
    }
  }
}
// 8x16 LD64 suite: remaining n > 16 cases and the first n_div_16 cases.
//
// Review fix: n_div_16 loops n over {32, 48} but handed the tester the
// constant .n(16), leaving the loop variable unused. It now passes .n(n), as
// n_div_16_strided_cn already does. This file is generated (see the file
// header), so the generator template needs the same correction.

// n = 17..31 with k in {1, 6, 11, 16}, m fixed at 8, a_stride(23).
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
    }
  }
}

// n = 17..31, k in {1, 6, 11, 16}, every m in 1..8; iterations(1) per case.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 8; ++m) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
      }
    }
  }
}

// n in {32, 48} with k in {1, 6, 11, 16}; m fixed at 8.
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)  // was .n(16): loop variable n never reached the tester
        .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
    }
  }
}

// n in {32, 48} with k in {1, 6, 11, 16}, m fixed at 8, cn_stride(19).
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(8).nr(16).kr(1).sr(1)
        .m(8).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
    }
  }
}
TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_a) { 3571 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3572 for (uint32_t n = 32; n <= 48; n += 16) { 3573 for (size_t k = 1; k <= 20; k += 5) { 3574 GemmMicrokernelTester() 3575 .mr(8) 3576 .nr(16) 3577 .kr(1) 3578 .sr(1) 3579 .m(8) 3580 .n(n) 3581 .k(k) 3582 .a_stride(23) 3583 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3584 } 3585 } 3586 } 3587 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_div_16_subtile)3588 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 3589 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3590 for (uint32_t n = 32; n <= 48; n += 16) { 3591 for (size_t k = 1; k <= 20; k += 5) { 3592 for (uint32_t m = 1; m <= 8; m++) { 3593 GemmMicrokernelTester() 3594 .mr(8) 3595 .nr(16) 3596 .kr(1) 3597 .sr(1) 3598 .m(m) 3599 .n(n) 3600 .k(k) 3601 .iterations(1) 3602 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3603 } 3604 } 3605 } 3606 } 3607 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cm_subtile)3608 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 3609 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3610 for (size_t k = 1; k <= 20; k += 5) { 3611 for (uint32_t m = 1; m <= 8; m++) { 3612 for (uint32_t n = 1; n <= 16; n++) { 3613 GemmMicrokernelTester() 3614 .mr(8) 3615 .nr(16) 3616 .kr(1) 3617 .sr(1) 3618 .m(m) 3619 .n(n) 3620 .k(k) 3621 .cm_stride(19) 3622 .iterations(1) 3623 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3624 } 3625 } 3626 } 3627 } 3628 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,qmin)3629 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmin) { 3630 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3631 GemmMicrokernelTester() 3632 .mr(8) 3633 .nr(16) 3634 .kr(1) 3635 .sr(1) 3636 .m(8) 3637 .n(16) 3638 .k(4) 3639 .qmin(128) 3640 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3641 } 3642 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,qmax)3643 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmax) { 3644 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3645 GemmMicrokernelTester() 3646 .mr(8) 3647 .nr(16) 3648 .kr(1) 3649 .sr(1) 3650 .m(8) 3651 .n(16) 3652 .k(4) 3653 .qmax(128) 3654 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3655 } 3656 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cm)3657 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm) { 3658 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3659 GemmMicrokernelTester() 3660 .mr(8) 3661 .nr(16) 3662 .kr(1) 3663 .sr(1) 3664 .m(8) 3665 .n(16) 3666 .k(4) 3667 .cm_stride(19) 3668 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64); 3669 } 3670 #endif // XNN_ARCH_ARM64 3671 3672 3673 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)3674 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 3675 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3676 GemmMicrokernelTester() 3677 .mr(1) 3678 .nr(16) 3679 .kr(1) 3680 .sr(1) 3681 .m(1) 3682 .n(16) 3683 .k(2) 3684 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3685 } 3686 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)3687 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 3688 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3689 GemmMicrokernelTester() 3690 .mr(1) 3691 .nr(16) 3692 .kr(1) 3693 .sr(1) 3694 .m(1) 3695 .n(16) 3696 .k(2) 3697 .cn_stride(19) 3698 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3699 } 3700 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_strided_a)3701 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) { 3702 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3703 GemmMicrokernelTester() 3704 .mr(1) 3705 .nr(16) 3706 .kr(1) 3707 .sr(1) 3708 .m(1) 3709 .n(16) 3710 .k(2) 3711 .a_stride(5) 3712 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3713 } 3714 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)3715 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 3716 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3717 for (uint32_t m = 1; m <= 1; m++) { 3718 for (uint32_t n = 1; n <= 16; n++) { 3719 GemmMicrokernelTester() 3720 .mr(1) 3721 .nr(16) 3722 .kr(1) 3723 .sr(1) 3724 .m(m) 3725 .n(n) 3726 .k(2) 3727 .iterations(1) 3728 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3729 } 3730 } 3731 } 3732 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)3733 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 3734 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3735 for (uint32_t m = 1; m <= 1; m++) { 3736 GemmMicrokernelTester() 3737 .mr(1) 3738 .nr(16) 3739 .kr(1) 3740 .sr(1) 3741 .m(m) 3742 .n(16) 3743 .k(2) 3744 .iterations(1) 3745 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3746 } 3747 } 3748 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)3749 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 3750 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3751 for (uint32_t n = 1; n <= 16; n++) { 3752 GemmMicrokernelTester() 3753 .mr(1) 3754 .nr(16) 3755 .kr(1) 3756 .sr(1) 3757 .m(1) 3758 .n(n) 3759 .k(2) 3760 .iterations(1) 3761 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3762 } 3763 } 3764 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)3765 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 3766 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3767 for (size_t k = 1; k < 2; k++) { 3768 GemmMicrokernelTester() 3769 .mr(1) 3770 .nr(16) 3771 .kr(1) 3772 .sr(1) 3773 .m(1) 3774 .n(16) 3775 .k(k) 3776 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3777 } 3778 } 3779 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_strided_a)3780 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) { 3781 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3782 for (size_t k = 1; k < 2; k++) { 3783 GemmMicrokernelTester() 3784 .mr(1) 3785 .nr(16) 3786 .kr(1) 3787 .sr(1) 3788 .m(1) 3789 
.n(16) 3790 .k(k) 3791 .a_stride(5) 3792 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3793 } 3794 } 3795 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)3796 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 3797 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3798 for (size_t k = 1; k < 2; k++) { 3799 for (uint32_t m = 1; m <= 1; m++) { 3800 for (uint32_t n = 1; n <= 16; n++) { 3801 GemmMicrokernelTester() 3802 .mr(1) 3803 .nr(16) 3804 .kr(1) 3805 .sr(1) 3806 .m(m) 3807 .n(n) 3808 .k(k) 3809 .iterations(1) 3810 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3811 } 3812 } 3813 } 3814 } 3815 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)3816 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 3817 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3818 for (size_t k = 3; k < 4; k++) { 3819 GemmMicrokernelTester() 3820 .mr(1) 3821 .nr(16) 3822 .kr(1) 3823 .sr(1) 3824 .m(1) 3825 .n(16) 3826 .k(k) 3827 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3828 } 3829 } 3830 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_strided_a)3831 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) { 3832 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3833 for (size_t k = 3; k < 4; k++) { 3834 GemmMicrokernelTester() 3835 .mr(1) 3836 .nr(16) 3837 .kr(1) 3838 .sr(1) 3839 .m(1) 3840 .n(16) 3841 .k(k) 3842 .a_stride(7) 3843 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3844 } 3845 } 3846 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_subtile)3847 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) { 3848 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3849 for (size_t k = 3; k < 4; k++) { 3850 for (uint32_t m = 1; m <= 1; m++) { 3851 for (uint32_t n = 1; n <= 16; n++) { 3852 GemmMicrokernelTester() 3853 .mr(1) 3854 .nr(16) 3855 .kr(1) 3856 .sr(1) 3857 .m(m) 3858 .n(n) 3859 .k(k) 3860 .iterations(1) 3861 
.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3862 } 3863 } 3864 } 3865 } 3866 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_div_2)3867 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) { 3868 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3869 for (size_t k = 4; k <= 20; k += 2) { 3870 GemmMicrokernelTester() 3871 .mr(1) 3872 .nr(16) 3873 .kr(1) 3874 .sr(1) 3875 .m(1) 3876 .n(16) 3877 .k(k) 3878 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3879 } 3880 } 3881 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_strided_a)3882 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) { 3883 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3884 for (size_t k = 4; k <= 20; k += 2) { 3885 GemmMicrokernelTester() 3886 .mr(1) 3887 .nr(16) 3888 .kr(1) 3889 .sr(1) 3890 .m(1) 3891 .n(16) 3892 .k(k) 3893 .a_stride(23) 3894 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3895 } 3896 } 3897 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)3898 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 3899 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3900 for (size_t k = 4; k <= 20; k += 2) { 3901 for (uint32_t m = 1; m <= 1; m++) { 3902 for (uint32_t n = 1; n <= 16; n++) { 3903 GemmMicrokernelTester() 3904 .mr(1) 3905 .nr(16) 3906 .kr(1) 3907 .sr(1) 3908 .m(m) 3909 .n(n) 3910 .k(k) 3911 .iterations(1) 3912 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3913 } 3914 } 3915 } 3916 } 3917 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)3918 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 3919 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3920 for (uint32_t n = 17; n < 32; n++) { 3921 for (size_t k = 1; k <= 10; k += 3) { 3922 GemmMicrokernelTester() 3923 .mr(1) 3924 .nr(16) 3925 .kr(1) 3926 .sr(1) 3927 .m(1) 3928 .n(16) 3929 .k(k) 3930 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3931 } 3932 } 3933 
} 3934 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)3935 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 3936 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3937 for (uint32_t n = 17; n < 32; n++) { 3938 for (size_t k = 1; k <= 10; k += 3) { 3939 GemmMicrokernelTester() 3940 .mr(1) 3941 .nr(16) 3942 .kr(1) 3943 .sr(1) 3944 .m(1) 3945 .n(16) 3946 .k(k) 3947 .cn_stride(19) 3948 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3949 } 3950 } 3951 } 3952 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_a)3953 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) { 3954 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3955 for (uint32_t n = 17; n < 32; n++) { 3956 for (size_t k = 1; k <= 10; k += 3) { 3957 GemmMicrokernelTester() 3958 .mr(1) 3959 .nr(16) 3960 .kr(1) 3961 .sr(1) 3962 .m(1) 3963 .n(n) 3964 .k(k) 3965 .a_stride(13) 3966 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3967 } 3968 } 3969 } 3970 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)3971 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) { 3972 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3973 for (uint32_t n = 17; n < 32; n++) { 3974 for (size_t k = 1; k <= 10; k += 3) { 3975 for (uint32_t m = 1; m <= 1; m++) { 3976 GemmMicrokernelTester() 3977 .mr(1) 3978 .nr(16) 3979 .kr(1) 3980 .sr(1) 3981 .m(m) 3982 .n(n) 3983 .k(k) 3984 .iterations(1) 3985 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 3986 } 3987 } 3988 } 3989 } 3990 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)3991 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) { 3992 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3993 for (uint32_t n = 32; n <= 48; n += 16) { 3994 for (size_t k = 1; k <= 10; k += 3) { 3995 GemmMicrokernelTester() 3996 .mr(1) 3997 .nr(16) 3998 .kr(1) 3999 .sr(1) 4000 .m(1) 4001 .n(16) 4002 .k(k) 4003 
.Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4004 } 4005 } 4006 } 4007 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_cn)4008 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) { 4009 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4010 for (uint32_t n = 32; n <= 48; n += 16) { 4011 for (size_t k = 1; k <= 10; k += 3) { 4012 GemmMicrokernelTester() 4013 .mr(1) 4014 .nr(16) 4015 .kr(1) 4016 .sr(1) 4017 .m(1) 4018 .n(n) 4019 .k(k) 4020 .cn_stride(19) 4021 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4022 } 4023 } 4024 } 4025 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_a)4026 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) { 4027 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4028 for (uint32_t n = 32; n <= 48; n += 16) { 4029 for (size_t k = 1; k <= 10; k += 3) { 4030 GemmMicrokernelTester() 4031 .mr(1) 4032 .nr(16) 4033 .kr(1) 4034 .sr(1) 4035 .m(1) 4036 .n(n) 4037 .k(k) 4038 .a_stride(13) 4039 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4040 } 4041 } 4042 } 4043 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_subtile)4044 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 4045 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4046 for (uint32_t n = 32; n <= 48; n += 16) { 4047 for (size_t k = 1; k <= 10; k += 3) { 4048 for (uint32_t m = 1; m <= 1; m++) { 4049 GemmMicrokernelTester() 4050 .mr(1) 4051 .nr(16) 4052 .kr(1) 4053 .sr(1) 4054 .m(m) 4055 .n(n) 4056 .k(k) 4057 .iterations(1) 4058 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4059 } 4060 } 4061 } 4062 } 4063 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)4064 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 4065 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4066 for (size_t k = 1; k <= 10; k += 3) { 4067 for (uint32_t m = 1; m <= 1; m++) { 4068 for (uint32_t n = 1; n <= 
16; n++) { 4069 GemmMicrokernelTester() 4070 .mr(1) 4071 .nr(16) 4072 .kr(1) 4073 .sr(1) 4074 .m(m) 4075 .n(n) 4076 .k(k) 4077 .cm_stride(19) 4078 .iterations(1) 4079 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4080 } 4081 } 4082 } 4083 } 4084 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,qmin)4085 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 4086 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4087 GemmMicrokernelTester() 4088 .mr(1) 4089 .nr(16) 4090 .kr(1) 4091 .sr(1) 4092 .m(1) 4093 .n(16) 4094 .k(2) 4095 .qmin(128) 4096 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4097 } 4098 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,qmax)4099 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 4100 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4101 GemmMicrokernelTester() 4102 .mr(1) 4103 .nr(16) 4104 .kr(1) 4105 .sr(1) 4106 .m(1) 4107 .n(16) 4108 .k(2) 4109 .qmax(128) 4110 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4111 } 4112 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)4113 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 4114 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4115 GemmMicrokernelTester() 4116 .mr(1) 4117 .nr(16) 4118 .kr(1) 4119 .sr(1) 4120 .m(1) 4121 .n(16) 4122 .k(2) 4123 .cm_stride(19) 4124 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32); 4125 } 4126 #endif // XNN_ARCH_ARM64 4127 4128 4129 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)4130 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 4131 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4132 GemmMicrokernelTester() 4133 .mr(4) 4134 .nr(16) 4135 .kr(1) 4136 .sr(1) 4137 .m(4) 4138 .n(16) 4139 .k(2) 4140 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4141 } 4142 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)4143 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 4144 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4145 GemmMicrokernelTester() 4146 .mr(4) 4147 .nr(16) 4148 .kr(1) 4149 .sr(1) 4150 .m(4) 4151 .n(16) 4152 .k(2) 4153 .cn_stride(19) 4154 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4155 } 4156 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_strided_a)4157 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) { 4158 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4159 GemmMicrokernelTester() 4160 .mr(4) 4161 .nr(16) 4162 .kr(1) 4163 .sr(1) 4164 .m(4) 4165 .n(16) 4166 .k(2) 4167 .a_stride(5) 4168 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4169 } 4170 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)4171 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 4172 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4173 for (uint32_t m = 1; m <= 4; m++) { 4174 for (uint32_t n = 1; n <= 16; n++) { 4175 GemmMicrokernelTester() 4176 .mr(4) 4177 .nr(16) 4178 .kr(1) 4179 .sr(1) 4180 .m(m) 4181 .n(n) 4182 .k(2) 4183 .iterations(1) 4184 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4185 } 4186 } 4187 } 4188 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)4189 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 4190 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4191 for (uint32_t m = 1; m <= 4; m++) { 4192 GemmMicrokernelTester() 4193 .mr(4) 4194 .nr(16) 4195 .kr(1) 4196 .sr(1) 4197 .m(m) 4198 .n(16) 4199 .k(2) 4200 .iterations(1) 4201 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4202 } 4203 } 4204 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)4205 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 4206 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4207 for (uint32_t n = 1; n <= 16; n++) { 4208 GemmMicrokernelTester() 4209 .mr(4) 4210 .nr(16) 4211 .kr(1) 4212 .sr(1) 4213 .m(4) 4214 .n(n) 4215 .k(2) 4216 .iterations(1) 4217 
.Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4218 } 4219 } 4220 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)4221 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 4222 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4223 for (size_t k = 1; k < 2; k++) { 4224 GemmMicrokernelTester() 4225 .mr(4) 4226 .nr(16) 4227 .kr(1) 4228 .sr(1) 4229 .m(4) 4230 .n(16) 4231 .k(k) 4232 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4233 } 4234 } 4235 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_strided_a)4236 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) { 4237 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4238 for (size_t k = 1; k < 2; k++) { 4239 GemmMicrokernelTester() 4240 .mr(4) 4241 .nr(16) 4242 .kr(1) 4243 .sr(1) 4244 .m(4) 4245 .n(16) 4246 .k(k) 4247 .a_stride(5) 4248 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4249 } 4250 } 4251 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)4252 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 4253 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4254 for (size_t k = 1; k < 2; k++) { 4255 for (uint32_t m = 1; m <= 4; m++) { 4256 for (uint32_t n = 1; n <= 16; n++) { 4257 GemmMicrokernelTester() 4258 .mr(4) 4259 .nr(16) 4260 .kr(1) 4261 .sr(1) 4262 .m(m) 4263 .n(n) 4264 .k(k) 4265 .iterations(1) 4266 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4267 } 4268 } 4269 } 4270 } 4271 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)4272 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 4273 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4274 for (size_t k = 3; k < 4; k++) { 4275 GemmMicrokernelTester() 4276 .mr(4) 4277 .nr(16) 4278 .kr(1) 4279 .sr(1) 4280 .m(4) 4281 .n(16) 4282 .k(k) 4283 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4284 } 4285 } 4286 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_strided_a)4287 
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) {
  // 4x16 output with k in [3, 4) (only k = 3), with a_stride set to 7.
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(7)
      .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
  }
}

// k in [3, 4), over every m in [1, 4] and n in [1, 16]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    for (uint32_t m = 1; m <= 4; m++) {
      for (uint32_t n = 1; n <= 16; n++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
      }
    }
  }
}

// 4x16 output with k = 4, 6, ..., 20.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
  }
}

// Same as k_div_2, with a_stride set to 23.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(23)
      .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
  }
}

// k = 4, 6, ..., 20, over every m in [1, 4] and n in [1, 16]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t m = 1; m <= 4; m++) {
      for (uint32_t n = 1; n <= 16; n++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
      }
    }
  }
}

// n in [17, 32) with k = 1, 4, 7, 10.
// Passes the loop's n to the tester; it was previously hard-coded to 16, so
// n > 16 was never exercised. This file is generated: apply the same change in
// the generator named in the file header.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
    }
  }
}

// n in [17, 32) with cn_stride set to 19.
// Passes the loop's n to the tester (was hard-coded to 16).
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
    }
  }
}

// n in [17, 32) with a_stride set to 13.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(13)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
    }
  }
}

// n in [17, 32), over every m in [1, 4]; one iteration per case.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
      }
    }
  }
}

// n = 32 and 48 with k = 1, 4, 7, 10.
// Passes the loop's n to the tester (was hard-coded to 16, so multiples of 16
// above 16 were never exercised).
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
    }
  }
}

// n = 32 and 48 with cn_stride set to 19.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
    }
  }
}

// n = 32 and 48 with a_stride set to 13.
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(13)
        .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
    }
  }
}
TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 4501 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4502 for (uint32_t n = 32; n <= 48; n += 16) { 4503 for (size_t k = 1; k <= 10; k += 3) { 4504 for (uint32_t m = 1; m <= 4; m++) { 4505 GemmMicrokernelTester() 4506 .mr(4) 4507 .nr(16) 4508 .kr(1) 4509 .sr(1) 4510 .m(m) 4511 .n(n) 4512 .k(k) 4513 .iterations(1) 4514 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4515 } 4516 } 4517 } 4518 } 4519 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)4520 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 4521 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4522 for (size_t k = 1; k <= 10; k += 3) { 4523 for (uint32_t m = 1; m <= 4; m++) { 4524 for (uint32_t n = 1; n <= 16; n++) { 4525 GemmMicrokernelTester() 4526 .mr(4) 4527 .nr(16) 4528 .kr(1) 4529 .sr(1) 4530 .m(m) 4531 .n(n) 4532 .k(k) 4533 .cm_stride(19) 4534 .iterations(1) 4535 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4536 } 4537 } 4538 } 4539 } 4540 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,qmin)4541 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 4542 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4543 GemmMicrokernelTester() 4544 .mr(4) 4545 .nr(16) 4546 .kr(1) 4547 .sr(1) 4548 .m(4) 4549 .n(16) 4550 .k(2) 4551 .qmin(128) 4552 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4553 } 4554 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,qmax)4555 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 4556 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4557 GemmMicrokernelTester() 4558 .mr(4) 4559 .nr(16) 4560 .kr(1) 4561 .sr(1) 4562 .m(4) 4563 .n(16) 4564 .k(2) 4565 .qmax(128) 4566 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4567 } 4568 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)4569 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 4570 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4571 GemmMicrokernelTester() 4572 .mr(4) 4573 .nr(16) 4574 .kr(1) 4575 .sr(1) 4576 .m(4) 4577 .n(16) 4578 .k(2) 4579 .cm_stride(19) 4580 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32); 4581 } 4582 #endif // XNN_ARCH_ARM64 4583 4584 4585 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)4586 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 4587 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4588 GemmMicrokernelTester() 4589 .mr(6) 4590 .nr(16) 4591 .kr(1) 4592 .sr(1) 4593 .m(6) 4594 .n(16) 4595 .k(2) 4596 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4597 } 4598 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)4599 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 4600 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4601 GemmMicrokernelTester() 4602 .mr(6) 4603 .nr(16) 4604 .kr(1) 4605 .sr(1) 4606 .m(6) 4607 .n(16) 4608 .k(2) 4609 .cn_stride(19) 4610 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4611 } 4612 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_strided_a)4613 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) { 4614 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4615 GemmMicrokernelTester() 4616 .mr(6) 4617 .nr(16) 4618 .kr(1) 4619 .sr(1) 4620 .m(6) 4621 .n(16) 4622 .k(2) 4623 .a_stride(5) 4624 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4625 } 4626 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)4627 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 4628 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4629 for (uint32_t m = 1; m <= 6; m++) { 4630 for (uint32_t n = 1; n <= 16; n++) { 4631 GemmMicrokernelTester() 4632 .mr(6) 4633 .nr(16) 4634 .kr(1) 4635 .sr(1) 4636 .m(m) 4637 .n(n) 4638 .k(2) 4639 .iterations(1) 4640 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4641 } 4642 } 4643 } 4644 
TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)4645 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 4646 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4647 for (uint32_t m = 1; m <= 6; m++) { 4648 GemmMicrokernelTester() 4649 .mr(6) 4650 .nr(16) 4651 .kr(1) 4652 .sr(1) 4653 .m(m) 4654 .n(16) 4655 .k(2) 4656 .iterations(1) 4657 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4658 } 4659 } 4660 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)4661 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 4662 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4663 for (uint32_t n = 1; n <= 16; n++) { 4664 GemmMicrokernelTester() 4665 .mr(6) 4666 .nr(16) 4667 .kr(1) 4668 .sr(1) 4669 .m(6) 4670 .n(n) 4671 .k(2) 4672 .iterations(1) 4673 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4674 } 4675 } 4676 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)4677 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 4678 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4679 for (size_t k = 1; k < 2; k++) { 4680 GemmMicrokernelTester() 4681 .mr(6) 4682 .nr(16) 4683 .kr(1) 4684 .sr(1) 4685 .m(6) 4686 .n(16) 4687 .k(k) 4688 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4689 } 4690 } 4691 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_strided_a)4692 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) { 4693 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4694 for (size_t k = 1; k < 2; k++) { 4695 GemmMicrokernelTester() 4696 .mr(6) 4697 .nr(16) 4698 .kr(1) 4699 .sr(1) 4700 .m(6) 4701 .n(16) 4702 .k(k) 4703 .a_stride(5) 4704 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4705 } 4706 } 4707 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)4708 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 4709 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4710 for (size_t k = 1; k < 2; 
k++) { 4711 for (uint32_t m = 1; m <= 6; m++) { 4712 for (uint32_t n = 1; n <= 16; n++) { 4713 GemmMicrokernelTester() 4714 .mr(6) 4715 .nr(16) 4716 .kr(1) 4717 .sr(1) 4718 .m(m) 4719 .n(n) 4720 .k(k) 4721 .iterations(1) 4722 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4723 } 4724 } 4725 } 4726 } 4727 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)4728 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 4729 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4730 for (size_t k = 3; k < 4; k++) { 4731 GemmMicrokernelTester() 4732 .mr(6) 4733 .nr(16) 4734 .kr(1) 4735 .sr(1) 4736 .m(6) 4737 .n(16) 4738 .k(k) 4739 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4740 } 4741 } 4742 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_strided_a)4743 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) { 4744 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4745 for (size_t k = 3; k < 4; k++) { 4746 GemmMicrokernelTester() 4747 .mr(6) 4748 .nr(16) 4749 .kr(1) 4750 .sr(1) 4751 .m(6) 4752 .n(16) 4753 .k(k) 4754 .a_stride(7) 4755 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4756 } 4757 } 4758 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_subtile)4759 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) { 4760 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4761 for (size_t k = 3; k < 4; k++) { 4762 for (uint32_t m = 1; m <= 6; m++) { 4763 for (uint32_t n = 1; n <= 16; n++) { 4764 GemmMicrokernelTester() 4765 .mr(6) 4766 .nr(16) 4767 .kr(1) 4768 .sr(1) 4769 .m(m) 4770 .n(n) 4771 .k(k) 4772 .iterations(1) 4773 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4774 } 4775 } 4776 } 4777 } 4778 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_div_2)4779 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) { 4780 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4781 for (size_t k = 4; k <= 20; k += 2) { 4782 GemmMicrokernelTester() 4783 .mr(6) 
4784 .nr(16) 4785 .kr(1) 4786 .sr(1) 4787 .m(6) 4788 .n(16) 4789 .k(k) 4790 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4791 } 4792 } 4793 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_strided_a)4794 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) { 4795 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4796 for (size_t k = 4; k <= 20; k += 2) { 4797 GemmMicrokernelTester() 4798 .mr(6) 4799 .nr(16) 4800 .kr(1) 4801 .sr(1) 4802 .m(6) 4803 .n(16) 4804 .k(k) 4805 .a_stride(23) 4806 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4807 } 4808 } 4809 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)4810 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 4811 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4812 for (size_t k = 4; k <= 20; k += 2) { 4813 for (uint32_t m = 1; m <= 6; m++) { 4814 for (uint32_t n = 1; n <= 16; n++) { 4815 GemmMicrokernelTester() 4816 .mr(6) 4817 .nr(16) 4818 .kr(1) 4819 .sr(1) 4820 .m(m) 4821 .n(n) 4822 .k(k) 4823 .iterations(1) 4824 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4825 } 4826 } 4827 } 4828 } 4829 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)4830 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 4831 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4832 for (uint32_t n = 17; n < 32; n++) { 4833 for (size_t k = 1; k <= 10; k += 3) { 4834 GemmMicrokernelTester() 4835 .mr(6) 4836 .nr(16) 4837 .kr(1) 4838 .sr(1) 4839 .m(6) 4840 .n(16) 4841 .k(k) 4842 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4843 } 4844 } 4845 } 4846 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)4847 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 4848 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4849 for (uint32_t n = 17; n < 32; n++) { 4850 for (size_t k = 1; k <= 10; k += 3) { 4851 GemmMicrokernelTester() 4852 .mr(6) 4853 .nr(16) 4854 .kr(1) 4855 
.sr(1) 4856 .m(6) 4857 .n(16) 4858 .k(k) 4859 .cn_stride(19) 4860 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4861 } 4862 } 4863 } 4864 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_a)4865 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) { 4866 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4867 for (uint32_t n = 17; n < 32; n++) { 4868 for (size_t k = 1; k <= 10; k += 3) { 4869 GemmMicrokernelTester() 4870 .mr(6) 4871 .nr(16) 4872 .kr(1) 4873 .sr(1) 4874 .m(6) 4875 .n(n) 4876 .k(k) 4877 .a_stride(13) 4878 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4879 } 4880 } 4881 } 4882 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)4883 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) { 4884 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4885 for (uint32_t n = 17; n < 32; n++) { 4886 for (size_t k = 1; k <= 10; k += 3) { 4887 for (uint32_t m = 1; m <= 6; m++) { 4888 GemmMicrokernelTester() 4889 .mr(6) 4890 .nr(16) 4891 .kr(1) 4892 .sr(1) 4893 .m(m) 4894 .n(n) 4895 .k(k) 4896 .iterations(1) 4897 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4898 } 4899 } 4900 } 4901 } 4902 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)4903 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) { 4904 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4905 for (uint32_t n = 32; n <= 48; n += 16) { 4906 for (size_t k = 1; k <= 10; k += 3) { 4907 GemmMicrokernelTester() 4908 .mr(6) 4909 .nr(16) 4910 .kr(1) 4911 .sr(1) 4912 .m(6) 4913 .n(16) 4914 .k(k) 4915 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4916 } 4917 } 4918 } 4919 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_cn)4920 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) { 4921 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4922 for (uint32_t n = 32; n <= 48; n += 16) { 4923 for (size_t k = 1; k <= 10; k += 3) { 4924 
GemmMicrokernelTester() 4925 .mr(6) 4926 .nr(16) 4927 .kr(1) 4928 .sr(1) 4929 .m(6) 4930 .n(n) 4931 .k(k) 4932 .cn_stride(19) 4933 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4934 } 4935 } 4936 } 4937 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_a)4938 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) { 4939 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4940 for (uint32_t n = 32; n <= 48; n += 16) { 4941 for (size_t k = 1; k <= 10; k += 3) { 4942 GemmMicrokernelTester() 4943 .mr(6) 4944 .nr(16) 4945 .kr(1) 4946 .sr(1) 4947 .m(6) 4948 .n(n) 4949 .k(k) 4950 .a_stride(13) 4951 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4952 } 4953 } 4954 } 4955 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_subtile)4956 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 4957 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4958 for (uint32_t n = 32; n <= 48; n += 16) { 4959 for (size_t k = 1; k <= 10; k += 3) { 4960 for (uint32_t m = 1; m <= 6; m++) { 4961 GemmMicrokernelTester() 4962 .mr(6) 4963 .nr(16) 4964 .kr(1) 4965 .sr(1) 4966 .m(m) 4967 .n(n) 4968 .k(k) 4969 .iterations(1) 4970 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4971 } 4972 } 4973 } 4974 } 4975 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)4976 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 4977 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4978 for (size_t k = 1; k <= 10; k += 3) { 4979 for (uint32_t m = 1; m <= 6; m++) { 4980 for (uint32_t n = 1; n <= 16; n++) { 4981 GemmMicrokernelTester() 4982 .mr(6) 4983 .nr(16) 4984 .kr(1) 4985 .sr(1) 4986 .m(m) 4987 .n(n) 4988 .k(k) 4989 .cm_stride(19) 4990 .iterations(1) 4991 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 4992 } 4993 } 4994 } 4995 } 4996 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,qmin)4997 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, 
qmin) { 4998 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4999 GemmMicrokernelTester() 5000 .mr(6) 5001 .nr(16) 5002 .kr(1) 5003 .sr(1) 5004 .m(6) 5005 .n(16) 5006 .k(2) 5007 .qmin(128) 5008 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 5009 } 5010 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,qmax)5011 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 5012 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5013 GemmMicrokernelTester() 5014 .mr(6) 5015 .nr(16) 5016 .kr(1) 5017 .sr(1) 5018 .m(6) 5019 .n(16) 5020 .k(2) 5021 .qmax(128) 5022 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 5023 } 5024 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)5025 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 5026 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5027 GemmMicrokernelTester() 5028 .mr(6) 5029 .nr(16) 5030 .kr(1) 5031 .sr(1) 5032 .m(6) 5033 .n(16) 5034 .k(2) 5035 .cm_stride(19) 5036 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32); 5037 } 5038 #endif // XNN_ARCH_ARM64 5039 5040 5041 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)5042 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 5043 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5044 GemmMicrokernelTester() 5045 .mr(1) 5046 .nr(8) 5047 .kr(1) 5048 .sr(1) 5049 .m(1) 5050 .n(8) 5051 .k(4) 5052 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5053 } 5054 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)5055 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 5056 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5057 GemmMicrokernelTester() 5058 .mr(1) 5059 .nr(8) 5060 .kr(1) 5061 .sr(1) 5062 .m(1) 5063 .n(8) 5064 .k(4) 5065 .cn_stride(11) 5066 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5067 } 5068 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)5069 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, 
k_eq_4_strided_a) { 5070 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5071 GemmMicrokernelTester() 5072 .mr(1) 5073 .nr(8) 5074 .kr(1) 5075 .sr(1) 5076 .m(1) 5077 .n(8) 5078 .k(4) 5079 .a_stride(7) 5080 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5081 } 5082 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)5083 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 5084 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5085 for (uint32_t m = 1; m <= 1; m++) { 5086 for (uint32_t n = 1; n <= 8; n++) { 5087 GemmMicrokernelTester() 5088 .mr(1) 5089 .nr(8) 5090 .kr(1) 5091 .sr(1) 5092 .m(m) 5093 .n(n) 5094 .k(4) 5095 .iterations(1) 5096 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5097 } 5098 } 5099 } 5100 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)5101 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 5102 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5103 for (uint32_t m = 1; m <= 1; m++) { 5104 GemmMicrokernelTester() 5105 .mr(1) 5106 .nr(8) 5107 .kr(1) 5108 .sr(1) 5109 .m(m) 5110 .n(8) 5111 .k(4) 5112 .iterations(1) 5113 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5114 } 5115 } 5116 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)5117 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 5118 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5119 for (uint32_t n = 1; n <= 8; n++) { 5120 GemmMicrokernelTester() 5121 .mr(1) 5122 .nr(8) 5123 .kr(1) 5124 .sr(1) 5125 .m(1) 5126 .n(n) 5127 .k(4) 5128 .iterations(1) 5129 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5130 } 5131 } 5132 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4)5133 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 5134 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5135 for (size_t k = 1; k < 4; k++) { 5136 GemmMicrokernelTester() 5137 .mr(1) 5138 .nr(8) 5139 .kr(1) 5140 .sr(1) 5141 .m(1) 5142 .n(8) 5143 .k(k) 5144 
.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5145 } 5146 } 5147 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)5148 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 5149 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5150 for (size_t k = 1; k < 4; k++) { 5151 GemmMicrokernelTester() 5152 .mr(1) 5153 .nr(8) 5154 .kr(1) 5155 .sr(1) 5156 .m(1) 5157 .n(8) 5158 .k(k) 5159 .a_stride(7) 5160 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5161 } 5162 } 5163 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)5164 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 5165 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5166 for (size_t k = 1; k < 4; k++) { 5167 for (uint32_t m = 1; m <= 1; m++) { 5168 for (uint32_t n = 1; n <= 8; n++) { 5169 GemmMicrokernelTester() 5170 .mr(1) 5171 .nr(8) 5172 .kr(1) 5173 .sr(1) 5174 .m(m) 5175 .n(n) 5176 .k(k) 5177 .iterations(1) 5178 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5179 } 5180 } 5181 } 5182 } 5183 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4)5184 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 5185 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5186 for (size_t k = 5; k < 8; k++) { 5187 GemmMicrokernelTester() 5188 .mr(1) 5189 .nr(8) 5190 .kr(1) 5191 .sr(1) 5192 .m(1) 5193 .n(8) 5194 .k(k) 5195 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5196 } 5197 } 5198 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)5199 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 5200 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5201 for (size_t k = 5; k < 8; k++) { 5202 GemmMicrokernelTester() 5203 .mr(1) 5204 .nr(8) 5205 .kr(1) 5206 .sr(1) 5207 .m(1) 5208 .n(8) 5209 .k(k) 5210 .a_stride(11) 5211 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5212 } 5213 } 5214 
// k in 5..7 for every (m, n) with m in 1..1 and n in 1..8; one iteration per case.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t m = 1; m <= 1; m++) {
      for (uint32_t n = 1; n <= 8; n++) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
      }
    }
  }
}

// Full 1x8 tile with k in {8, 12, ..., 40}.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
  }
}

// Full 1x8 tile with k in {8, 12, ..., 40} and a_stride set to 43.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(1).sr(1)
      .m(1).n(8).k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
  }
}

// k in {8, 12, ..., 40} for every (m, n) with m in 1..1 and n in 1..8.
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t m = 1; m <= 1; m++) {
      for (uint32_t n = 1; n <= 8; n++) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
      }
    }
  }
}
TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8)5286 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) { 5287 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5288 for (uint32_t n = 9; n < 16; n++) { 5289 for (size_t k = 1; k <= 20; k += 5) { 5290 GemmMicrokernelTester() 5291 .mr(1) 5292 .nr(8) 5293 .kr(1) 5294 .sr(1) 5295 .m(1) 5296 .n(8) 5297 .k(k) 5298 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5299 } 5300 } 5301 } 5302 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_cn)5303 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 5304 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5305 for (uint32_t n = 9; n < 16; n++) { 5306 for (size_t k = 1; k <= 20; k += 5) { 5307 GemmMicrokernelTester() 5308 .mr(1) 5309 .nr(8) 5310 .kr(1) 5311 .sr(1) 5312 .m(1) 5313 .n(8) 5314 .k(k) 5315 .cn_stride(11) 5316 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5317 } 5318 } 5319 } 5320 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_a)5321 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) { 5322 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5323 for (uint32_t n = 9; n < 16; n++) { 5324 for (size_t k = 1; k <= 20; k += 5) { 5325 GemmMicrokernelTester() 5326 .mr(1) 5327 .nr(8) 5328 .kr(1) 5329 .sr(1) 5330 .m(1) 5331 .n(n) 5332 .k(k) 5333 .a_stride(23) 5334 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5335 } 5336 } 5337 } 5338 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_subtile)5339 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) { 5340 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5341 for (uint32_t n = 9; n < 16; n++) { 5342 for (size_t k = 1; k <= 20; k += 5) { 5343 for (uint32_t m = 1; m <= 1; m++) { 5344 GemmMicrokernelTester() 5345 .mr(1) 5346 .nr(8) 5347 .kr(1) 5348 .sr(1) 5349 .m(m) 5350 .n(n) 5351 .k(k) 5352 .iterations(1) 5353 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5354 } 5355 } 5356 
} 5357 } 5358 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_div_8)5359 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) { 5360 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5361 for (uint32_t n = 16; n <= 24; n += 8) { 5362 for (size_t k = 1; k <= 20; k += 5) { 5363 GemmMicrokernelTester() 5364 .mr(1) 5365 .nr(8) 5366 .kr(1) 5367 .sr(1) 5368 .m(1) 5369 .n(8) 5370 .k(k) 5371 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5372 } 5373 } 5374 } 5375 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_cn)5376 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) { 5377 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5378 for (uint32_t n = 16; n <= 24; n += 8) { 5379 for (size_t k = 1; k <= 20; k += 5) { 5380 GemmMicrokernelTester() 5381 .mr(1) 5382 .nr(8) 5383 .kr(1) 5384 .sr(1) 5385 .m(1) 5386 .n(n) 5387 .k(k) 5388 .cn_stride(11) 5389 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5390 } 5391 } 5392 } 5393 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_a)5394 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) { 5395 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5396 for (uint32_t n = 16; n <= 24; n += 8) { 5397 for (size_t k = 1; k <= 20; k += 5) { 5398 GemmMicrokernelTester() 5399 .mr(1) 5400 .nr(8) 5401 .kr(1) 5402 .sr(1) 5403 .m(1) 5404 .n(n) 5405 .k(k) 5406 .a_stride(23) 5407 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5408 } 5409 } 5410 } 5411 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_subtile)5412 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) { 5413 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5414 for (uint32_t n = 16; n <= 24; n += 8) { 5415 for (size_t k = 1; k <= 20; k += 5) { 5416 for (uint32_t m = 1; m <= 1; m++) { 5417 GemmMicrokernelTester() 5418 .mr(1) 5419 .nr(8) 5420 .kr(1) 5421 .sr(1) 5422 .m(m) 5423 .n(n) 5424 .k(k) 5425 .iterations(1) 5426 
.Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5427 } 5428 } 5429 } 5430 } 5431 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)5432 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 5433 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5434 for (size_t k = 1; k <= 20; k += 5) { 5435 for (uint32_t m = 1; m <= 1; m++) { 5436 for (uint32_t n = 1; n <= 8; n++) { 5437 GemmMicrokernelTester() 5438 .mr(1) 5439 .nr(8) 5440 .kr(1) 5441 .sr(1) 5442 .m(m) 5443 .n(n) 5444 .k(k) 5445 .cm_stride(11) 5446 .iterations(1) 5447 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5448 } 5449 } 5450 } 5451 } 5452 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,qmin)5453 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, qmin) { 5454 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5455 GemmMicrokernelTester() 5456 .mr(1) 5457 .nr(8) 5458 .kr(1) 5459 .sr(1) 5460 .m(1) 5461 .n(8) 5462 .k(4) 5463 .qmin(128) 5464 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5465 } 5466 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,qmax)5467 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 5468 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5469 GemmMicrokernelTester() 5470 .mr(1) 5471 .nr(8) 5472 .kr(1) 5473 .sr(1) 5474 .m(1) 5475 .n(8) 5476 .k(4) 5477 .qmax(128) 5478 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5479 } 5480 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)5481 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 5482 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5483 GemmMicrokernelTester() 5484 .mr(1) 5485 .nr(8) 5486 .kr(1) 5487 .sr(1) 5488 .m(1) 5489 .n(8) 5490 .k(4) 5491 .cm_stride(11) 5492 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64); 5493 } 5494 #endif // XNN_ARCH_ARM64 5495 5496 5497 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)5498 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, 
k_eq_4) { 5499 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5500 GemmMicrokernelTester() 5501 .mr(4) 5502 .nr(8) 5503 .kr(1) 5504 .sr(1) 5505 .m(4) 5506 .n(8) 5507 .k(4) 5508 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5509 } 5510 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)5511 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 5512 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5513 GemmMicrokernelTester() 5514 .mr(4) 5515 .nr(8) 5516 .kr(1) 5517 .sr(1) 5518 .m(4) 5519 .n(8) 5520 .k(4) 5521 .cn_stride(11) 5522 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5523 } 5524 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)5525 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 5526 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5527 GemmMicrokernelTester() 5528 .mr(4) 5529 .nr(8) 5530 .kr(1) 5531 .sr(1) 5532 .m(4) 5533 .n(8) 5534 .k(4) 5535 .a_stride(7) 5536 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5537 } 5538 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)5539 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 5540 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5541 for (uint32_t m = 1; m <= 4; m++) { 5542 for (uint32_t n = 1; n <= 8; n++) { 5543 GemmMicrokernelTester() 5544 .mr(4) 5545 .nr(8) 5546 .kr(1) 5547 .sr(1) 5548 .m(m) 5549 .n(n) 5550 .k(4) 5551 .iterations(1) 5552 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5553 } 5554 } 5555 } 5556 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)5557 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 5558 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5559 for (uint32_t m = 1; m <= 4; m++) { 5560 GemmMicrokernelTester() 5561 .mr(4) 5562 .nr(8) 5563 .kr(1) 5564 .sr(1) 5565 .m(m) 5566 .n(8) 5567 .k(4) 5568 .iterations(1) 5569 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5570 } 5571 } 5572 
TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)5573 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 5574 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5575 for (uint32_t n = 1; n <= 8; n++) { 5576 GemmMicrokernelTester() 5577 .mr(4) 5578 .nr(8) 5579 .kr(1) 5580 .sr(1) 5581 .m(4) 5582 .n(n) 5583 .k(4) 5584 .iterations(1) 5585 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5586 } 5587 } 5588 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4)5589 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 5590 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5591 for (size_t k = 1; k < 4; k++) { 5592 GemmMicrokernelTester() 5593 .mr(4) 5594 .nr(8) 5595 .kr(1) 5596 .sr(1) 5597 .m(4) 5598 .n(8) 5599 .k(k) 5600 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5601 } 5602 } 5603 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)5604 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 5605 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5606 for (size_t k = 1; k < 4; k++) { 5607 GemmMicrokernelTester() 5608 .mr(4) 5609 .nr(8) 5610 .kr(1) 5611 .sr(1) 5612 .m(4) 5613 .n(8) 5614 .k(k) 5615 .a_stride(7) 5616 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5617 } 5618 } 5619 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)5620 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 5621 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5622 for (size_t k = 1; k < 4; k++) { 5623 for (uint32_t m = 1; m <= 4; m++) { 5624 for (uint32_t n = 1; n <= 8; n++) { 5625 GemmMicrokernelTester() 5626 .mr(4) 5627 .nr(8) 5628 .kr(1) 5629 .sr(1) 5630 .m(m) 5631 .n(n) 5632 .k(k) 5633 .iterations(1) 5634 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5635 } 5636 } 5637 } 5638 } 5639 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4)5640 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 5641 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5642 for (size_t k = 5; k < 8; k++) { 5643 GemmMicrokernelTester() 5644 .mr(4) 5645 .nr(8) 5646 .kr(1) 5647 .sr(1) 5648 .m(4) 5649 .n(8) 5650 .k(k) 5651 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5652 } 5653 } 5654 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)5655 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 5656 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5657 for (size_t k = 5; k < 8; k++) { 5658 GemmMicrokernelTester() 5659 .mr(4) 5660 .nr(8) 5661 .kr(1) 5662 .sr(1) 5663 .m(4) 5664 .n(8) 5665 .k(k) 5666 .a_stride(11) 5667 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5668 } 5669 } 5670 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)5671 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 5672 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5673 for (size_t k = 5; k < 8; k++) { 5674 for (uint32_t m = 1; m <= 4; m++) { 5675 for (uint32_t n = 1; n <= 8; n++) { 5676 GemmMicrokernelTester() 5677 .mr(4) 5678 .nr(8) 5679 .kr(1) 5680 .sr(1) 5681 .m(m) 5682 .n(n) 5683 .k(k) 5684 .iterations(1) 5685 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5686 } 5687 } 5688 } 5689 } 5690 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_div_4)5691 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 5692 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5693 for (size_t k = 8; k <= 40; k += 4) { 5694 GemmMicrokernelTester() 5695 .mr(4) 5696 .nr(8) 5697 .kr(1) 5698 .sr(1) 5699 .m(4) 5700 .n(8) 5701 .k(k) 5702 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5703 } 5704 } 5705 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_div_4_strided_a)5706 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) { 5707 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5708 for (size_t k = 8; k <= 40; k += 4) { 5709 GemmMicrokernelTester() 5710 .mr(4) 5711 .nr(8) 5712 .kr(1) 5713 .sr(1) 5714 
.m(4) 5715 .n(8) 5716 .k(k) 5717 .a_stride(43) 5718 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5719 } 5720 } 5721 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)5722 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 5723 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5724 for (size_t k = 8; k <= 40; k += 4) { 5725 for (uint32_t m = 1; m <= 4; m++) { 5726 for (uint32_t n = 1; n <= 8; n++) { 5727 GemmMicrokernelTester() 5728 .mr(4) 5729 .nr(8) 5730 .kr(1) 5731 .sr(1) 5732 .m(m) 5733 .n(n) 5734 .k(k) 5735 .iterations(1) 5736 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5737 } 5738 } 5739 } 5740 } 5741 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8)5742 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) { 5743 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5744 for (uint32_t n = 9; n < 16; n++) { 5745 for (size_t k = 1; k <= 20; k += 5) { 5746 GemmMicrokernelTester() 5747 .mr(4) 5748 .nr(8) 5749 .kr(1) 5750 .sr(1) 5751 .m(4) 5752 .n(8) 5753 .k(k) 5754 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5755 } 5756 } 5757 } 5758 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_cn)5759 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 5760 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5761 for (uint32_t n = 9; n < 16; n++) { 5762 for (size_t k = 1; k <= 20; k += 5) { 5763 GemmMicrokernelTester() 5764 .mr(4) 5765 .nr(8) 5766 .kr(1) 5767 .sr(1) 5768 .m(4) 5769 .n(8) 5770 .k(k) 5771 .cn_stride(11) 5772 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5773 } 5774 } 5775 } 5776 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_a)5777 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) { 5778 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5779 for (uint32_t n = 9; n < 16; n++) { 5780 for (size_t k = 1; k <= 20; k += 5) { 5781 GemmMicrokernelTester() 5782 .mr(4) 5783 .nr(8) 5784 .kr(1) 5785 
.sr(1) 5786 .m(4) 5787 .n(n) 5788 .k(k) 5789 .a_stride(23) 5790 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5791 } 5792 } 5793 } 5794 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_subtile)5795 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) { 5796 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5797 for (uint32_t n = 9; n < 16; n++) { 5798 for (size_t k = 1; k <= 20; k += 5) { 5799 for (uint32_t m = 1; m <= 4; m++) { 5800 GemmMicrokernelTester() 5801 .mr(4) 5802 .nr(8) 5803 .kr(1) 5804 .sr(1) 5805 .m(m) 5806 .n(n) 5807 .k(k) 5808 .iterations(1) 5809 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5810 } 5811 } 5812 } 5813 } 5814 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_div_8)5815 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) { 5816 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5817 for (uint32_t n = 16; n <= 24; n += 8) { 5818 for (size_t k = 1; k <= 20; k += 5) { 5819 GemmMicrokernelTester() 5820 .mr(4) 5821 .nr(8) 5822 .kr(1) 5823 .sr(1) 5824 .m(4) 5825 .n(8) 5826 .k(k) 5827 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5828 } 5829 } 5830 } 5831 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_cn)5832 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) { 5833 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5834 for (uint32_t n = 16; n <= 24; n += 8) { 5835 for (size_t k = 1; k <= 20; k += 5) { 5836 GemmMicrokernelTester() 5837 .mr(4) 5838 .nr(8) 5839 .kr(1) 5840 .sr(1) 5841 .m(4) 5842 .n(n) 5843 .k(k) 5844 .cn_stride(11) 5845 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5846 } 5847 } 5848 } 5849 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_a)5850 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) { 5851 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5852 for (uint32_t n = 16; n <= 24; n += 8) { 5853 for (size_t k = 1; k <= 20; k += 5) { 5854 GemmMicrokernelTester() 5855 
.mr(4) 5856 .nr(8) 5857 .kr(1) 5858 .sr(1) 5859 .m(4) 5860 .n(n) 5861 .k(k) 5862 .a_stride(23) 5863 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5864 } 5865 } 5866 } 5867 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_subtile)5868 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) { 5869 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5870 for (uint32_t n = 16; n <= 24; n += 8) { 5871 for (size_t k = 1; k <= 20; k += 5) { 5872 for (uint32_t m = 1; m <= 4; m++) { 5873 GemmMicrokernelTester() 5874 .mr(4) 5875 .nr(8) 5876 .kr(1) 5877 .sr(1) 5878 .m(m) 5879 .n(n) 5880 .k(k) 5881 .iterations(1) 5882 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5883 } 5884 } 5885 } 5886 } 5887 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)5888 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 5889 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5890 for (size_t k = 1; k <= 20; k += 5) { 5891 for (uint32_t m = 1; m <= 4; m++) { 5892 for (uint32_t n = 1; n <= 8; n++) { 5893 GemmMicrokernelTester() 5894 .mr(4) 5895 .nr(8) 5896 .kr(1) 5897 .sr(1) 5898 .m(m) 5899 .n(n) 5900 .k(k) 5901 .cm_stride(11) 5902 .iterations(1) 5903 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5904 } 5905 } 5906 } 5907 } 5908 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,qmin)5909 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, qmin) { 5910 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5911 GemmMicrokernelTester() 5912 .mr(4) 5913 .nr(8) 5914 .kr(1) 5915 .sr(1) 5916 .m(4) 5917 .n(8) 5918 .k(4) 5919 .qmin(128) 5920 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5921 } 5922 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,qmax)5923 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 5924 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5925 GemmMicrokernelTester() 5926 .mr(4) 5927 .nr(8) 5928 .kr(1) 5929 .sr(1) 5930 .m(4) 5931 .n(8) 5932 .k(4) 5933 .qmax(128) 5934 
.Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5935 } 5936 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)5937 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 5938 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5939 GemmMicrokernelTester() 5940 .mr(4) 5941 .nr(8) 5942 .kr(1) 5943 .sr(1) 5944 .m(4) 5945 .n(8) 5946 .k(4) 5947 .cm_stride(11) 5948 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64); 5949 } 5950 #endif // XNN_ARCH_ARM64 5951 5952 5953 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)5954 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 5955 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5956 GemmMicrokernelTester() 5957 .mr(6) 5958 .nr(8) 5959 .kr(1) 5960 .sr(1) 5961 .m(6) 5962 .n(8) 5963 .k(4) 5964 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 5965 } 5966 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)5967 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 5968 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5969 GemmMicrokernelTester() 5970 .mr(6) 5971 .nr(8) 5972 .kr(1) 5973 .sr(1) 5974 .m(6) 5975 .n(8) 5976 .k(4) 5977 .cn_stride(11) 5978 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 5979 } 5980 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)5981 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 5982 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5983 GemmMicrokernelTester() 5984 .mr(6) 5985 .nr(8) 5986 .kr(1) 5987 .sr(1) 5988 .m(6) 5989 .n(8) 5990 .k(4) 5991 .a_stride(7) 5992 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 5993 } 5994 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)5995 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 5996 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5997 for (uint32_t m = 1; m <= 6; m++) { 5998 for (uint32_t n = 1; n <= 8; n++) { 5999 GemmMicrokernelTester() 6000 .mr(6) 
6001 .nr(8) 6002 .kr(1) 6003 .sr(1) 6004 .m(m) 6005 .n(n) 6006 .k(4) 6007 .iterations(1) 6008 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6009 } 6010 } 6011 } 6012 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)6013 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 6014 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6015 for (uint32_t m = 1; m <= 6; m++) { 6016 GemmMicrokernelTester() 6017 .mr(6) 6018 .nr(8) 6019 .kr(1) 6020 .sr(1) 6021 .m(m) 6022 .n(8) 6023 .k(4) 6024 .iterations(1) 6025 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6026 } 6027 } 6028 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)6029 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 6030 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6031 for (uint32_t n = 1; n <= 8; n++) { 6032 GemmMicrokernelTester() 6033 .mr(6) 6034 .nr(8) 6035 .kr(1) 6036 .sr(1) 6037 .m(6) 6038 .n(n) 6039 .k(4) 6040 .iterations(1) 6041 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6042 } 6043 } 6044 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4)6045 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 6046 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6047 for (size_t k = 1; k < 4; k++) { 6048 GemmMicrokernelTester() 6049 .mr(6) 6050 .nr(8) 6051 .kr(1) 6052 .sr(1) 6053 .m(6) 6054 .n(8) 6055 .k(k) 6056 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6057 } 6058 } 6059 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)6060 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 6061 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6062 for (size_t k = 1; k < 4; k++) { 6063 GemmMicrokernelTester() 6064 .mr(6) 6065 .nr(8) 6066 .kr(1) 6067 .sr(1) 6068 .m(6) 6069 .n(8) 6070 .k(k) 6071 .a_stride(7) 6072 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6073 } 6074 } 6075 
// k in 1..3 for every (m, n) with m in 1..6 and n in 1..8; one iteration per case.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t m = 1; m <= 6; m++) {
      for (uint32_t n = 1; n <= 8; n++) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
      }
    }
  }
}

// Full 6x8 tile with k in 5..7.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
  }
}

// Full 6x8 tile with k in 5..7 and a_stride set to 11.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k)
      .a_stride(11)
      .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
  }
}

// k in 5..7 for every (m, n) with m in 1..6 and n in 1..8.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t m = 1; m <= 6; m++) {
      for (uint32_t n = 1; n <= 8; n++) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
      }
    }
  }
}
// Full 6x8 tile with k in {8, 12, ..., 40}.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k)
      .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
  }
}

// Full 6x8 tile with k in {8, 12, ..., 40} and a_stride set to 43.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(k)
      .a_stride(43)
      .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
  }
}

// k in {8, 12, ..., 40} for every (m, n) with m in 1..6 and n in 1..8.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t m = 1; m <= 6; m++) {
      for (uint32_t n = 1; n <= 8; n++) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
      }
    }
  }
}

// n in 9..15, k in {1, 6, 11, 16}, m = 6. The loop's n is passed to the
// tester so that each iteration exercises a different column count.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
    }
  }
}
// n in 9..15, k in {1, 6, 11, 16}, m = 6, with cn_stride set to 11. The
// loop's n is passed to the tester so that each column count is exercised.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
    }
  }
}

// n in 9..15, k in {1, 6, 11, 16}, m = 6, with a_stride set to 23.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n).k(k)
        .a_stride(23)
        .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
    }
  }
}

// n in 9..15, k in {1, 6, 11, 16}, every m in 1..6; one iteration per case.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
      }
    }
  }
}

// n in {16, 24}, k in {1, 6, 11, 16}, m = 6. The loop's n is passed to the
// tester so that both column counts are actually exercised.
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(n).k(k)
        .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
    }
  }
}
TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_cn)6288 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) { 6289 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6290 for (uint32_t n = 16; n <= 24; n += 8) { 6291 for (size_t k = 1; k <= 20; k += 5) { 6292 GemmMicrokernelTester() 6293 .mr(6) 6294 .nr(8) 6295 .kr(1) 6296 .sr(1) 6297 .m(6) 6298 .n(n) 6299 .k(k) 6300 .cn_stride(11) 6301 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6302 } 6303 } 6304 } 6305 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_a)6306 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) { 6307 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6308 for (uint32_t n = 16; n <= 24; n += 8) { 6309 for (size_t k = 1; k <= 20; k += 5) { 6310 GemmMicrokernelTester() 6311 .mr(6) 6312 .nr(8) 6313 .kr(1) 6314 .sr(1) 6315 .m(6) 6316 .n(n) 6317 .k(k) 6318 .a_stride(23) 6319 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6320 } 6321 } 6322 } 6323 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_subtile)6324 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) { 6325 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6326 for (uint32_t n = 16; n <= 24; n += 8) { 6327 for (size_t k = 1; k <= 20; k += 5) { 6328 for (uint32_t m = 1; m <= 6; m++) { 6329 GemmMicrokernelTester() 6330 .mr(6) 6331 .nr(8) 6332 .kr(1) 6333 .sr(1) 6334 .m(m) 6335 .n(n) 6336 .k(k) 6337 .iterations(1) 6338 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6339 } 6340 } 6341 } 6342 } 6343 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)6344 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 6345 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6346 for (size_t k = 1; k <= 20; k += 5) { 6347 for (uint32_t m = 1; m <= 6; m++) { 6348 for (uint32_t n = 1; n <= 8; n++) { 6349 GemmMicrokernelTester() 6350 .mr(6) 6351 .nr(8) 6352 .kr(1) 6353 .sr(1) 6354 .m(m) 6355 .n(n) 6356 .k(k) 
6357 .cm_stride(11) 6358 .iterations(1) 6359 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6360 } 6361 } 6362 } 6363 } 6364 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,qmin)6365 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, qmin) { 6366 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6367 GemmMicrokernelTester() 6368 .mr(6) 6369 .nr(8) 6370 .kr(1) 6371 .sr(1) 6372 .m(6) 6373 .n(8) 6374 .k(4) 6375 .qmin(128) 6376 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6377 } 6378 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,qmax)6379 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 6380 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6381 GemmMicrokernelTester() 6382 .mr(6) 6383 .nr(8) 6384 .kr(1) 6385 .sr(1) 6386 .m(6) 6387 .n(8) 6388 .k(4) 6389 .qmax(128) 6390 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6391 } 6392 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)6393 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 6394 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6395 GemmMicrokernelTester() 6396 .mr(6) 6397 .nr(8) 6398 .kr(1) 6399 .sr(1) 6400 .m(6) 6401 .n(8) 6402 .k(4) 6403 .cm_stride(11) 6404 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64); 6405 } 6406 #endif // XNN_ARCH_ARM64 6407 6408 6409 #if XNN_ARCH_ARM64 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4)6410 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 6411 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6412 GemmMicrokernelTester() 6413 .mr(8) 6414 .nr(8) 6415 .kr(1) 6416 .sr(1) 6417 .m(8) 6418 .n(8) 6419 .k(4) 6420 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6421 } 6422 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,strided_cn)6423 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 6424 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6425 GemmMicrokernelTester() 6426 .mr(8) 6427 .nr(8) 6428 .kr(1) 6429 .sr(1) 6430 .m(8) 6431 .n(8) 6432 .k(4) 
6433 .cn_stride(11) 6434 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6435 } 6436 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_strided_a)6437 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) { 6438 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6439 GemmMicrokernelTester() 6440 .mr(8) 6441 .nr(8) 6442 .kr(1) 6443 .sr(1) 6444 .m(8) 6445 .n(8) 6446 .k(4) 6447 .a_stride(7) 6448 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6449 } 6450 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)6451 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 6452 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6453 for (uint32_t m = 1; m <= 8; m++) { 6454 for (uint32_t n = 1; n <= 8; n++) { 6455 GemmMicrokernelTester() 6456 .mr(8) 6457 .nr(8) 6458 .kr(1) 6459 .sr(1) 6460 .m(m) 6461 .n(n) 6462 .k(4) 6463 .iterations(1) 6464 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6465 } 6466 } 6467 } 6468 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)6469 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 6470 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6471 for (uint32_t m = 1; m <= 8; m++) { 6472 GemmMicrokernelTester() 6473 .mr(8) 6474 .nr(8) 6475 .kr(1) 6476 .sr(1) 6477 .m(m) 6478 .n(8) 6479 .k(4) 6480 .iterations(1) 6481 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6482 } 6483 } 6484 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)6485 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 6486 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6487 for (uint32_t n = 1; n <= 8; n++) { 6488 GemmMicrokernelTester() 6489 .mr(8) 6490 .nr(8) 6491 .kr(1) 6492 .sr(1) 6493 .m(8) 6494 .n(n) 6495 .k(4) 6496 .iterations(1) 6497 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6498 } 6499 } 6500 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4)6501 
TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 6502 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6503 for (size_t k = 1; k < 4; k++) { 6504 GemmMicrokernelTester() 6505 .mr(8) 6506 .nr(8) 6507 .kr(1) 6508 .sr(1) 6509 .m(8) 6510 .n(8) 6511 .k(k) 6512 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6513 } 6514 } 6515 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_strided_a)6516 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) { 6517 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6518 for (size_t k = 1; k < 4; k++) { 6519 GemmMicrokernelTester() 6520 .mr(8) 6521 .nr(8) 6522 .kr(1) 6523 .sr(1) 6524 .m(8) 6525 .n(8) 6526 .k(k) 6527 .a_stride(7) 6528 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6529 } 6530 } 6531 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)6532 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 6533 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6534 for (size_t k = 1; k < 4; k++) { 6535 for (uint32_t m = 1; m <= 8; m++) { 6536 for (uint32_t n = 1; n <= 8; n++) { 6537 GemmMicrokernelTester() 6538 .mr(8) 6539 .nr(8) 6540 .kr(1) 6541 .sr(1) 6542 .m(m) 6543 .n(n) 6544 .k(k) 6545 .iterations(1) 6546 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6547 } 6548 } 6549 } 6550 } 6551 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4)6552 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 6553 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6554 for (size_t k = 5; k < 8; k++) { 6555 GemmMicrokernelTester() 6556 .mr(8) 6557 .nr(8) 6558 .kr(1) 6559 .sr(1) 6560 .m(8) 6561 .n(8) 6562 .k(k) 6563 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6564 } 6565 } 6566 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_strided_a)6567 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) { 6568 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6569 for (size_t k = 5; k < 8; k++) { 6570 GemmMicrokernelTester() 
6571 .mr(8) 6572 .nr(8) 6573 .kr(1) 6574 .sr(1) 6575 .m(8) 6576 .n(8) 6577 .k(k) 6578 .a_stride(11) 6579 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6580 } 6581 } 6582 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)6583 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 6584 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6585 for (size_t k = 5; k < 8; k++) { 6586 for (uint32_t m = 1; m <= 8; m++) { 6587 for (uint32_t n = 1; n <= 8; n++) { 6588 GemmMicrokernelTester() 6589 .mr(8) 6590 .nr(8) 6591 .kr(1) 6592 .sr(1) 6593 .m(m) 6594 .n(n) 6595 .k(k) 6596 .iterations(1) 6597 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6598 } 6599 } 6600 } 6601 } 6602 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_div_4)6603 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 6604 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6605 for (size_t k = 8; k <= 40; k += 4) { 6606 GemmMicrokernelTester() 6607 .mr(8) 6608 .nr(8) 6609 .kr(1) 6610 .sr(1) 6611 .m(8) 6612 .n(8) 6613 .k(k) 6614 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6615 } 6616 } 6617 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_div_4_strided_a)6618 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) { 6619 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6620 for (size_t k = 8; k <= 40; k += 4) { 6621 GemmMicrokernelTester() 6622 .mr(8) 6623 .nr(8) 6624 .kr(1) 6625 .sr(1) 6626 .m(8) 6627 .n(8) 6628 .k(k) 6629 .a_stride(43) 6630 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6631 } 6632 } 6633 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)6634 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 6635 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6636 for (size_t k = 8; k <= 40; k += 4) { 6637 for (uint32_t m = 1; m <= 8; m++) { 6638 for (uint32_t n = 1; n <= 8; n++) { 6639 GemmMicrokernelTester() 6640 .mr(8) 6641 .nr(8) 6642 .kr(1) 6643 .sr(1) 6644 
.m(m) 6645 .n(n) 6646 .k(k) 6647 .iterations(1) 6648 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6649 } 6650 } 6651 } 6652 } 6653 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8)6654 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) { 6655 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6656 for (uint32_t n = 9; n < 16; n++) { 6657 for (size_t k = 1; k <= 20; k += 5) { 6658 GemmMicrokernelTester() 6659 .mr(8) 6660 .nr(8) 6661 .kr(1) 6662 .sr(1) 6663 .m(8) 6664 .n(8) 6665 .k(k) 6666 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6667 } 6668 } 6669 } 6670 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_cn)6671 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 6672 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6673 for (uint32_t n = 9; n < 16; n++) { 6674 for (size_t k = 1; k <= 20; k += 5) { 6675 GemmMicrokernelTester() 6676 .mr(8) 6677 .nr(8) 6678 .kr(1) 6679 .sr(1) 6680 .m(8) 6681 .n(8) 6682 .k(k) 6683 .cn_stride(11) 6684 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6685 } 6686 } 6687 } 6688 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_strided_a)6689 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) { 6690 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6691 for (uint32_t n = 9; n < 16; n++) { 6692 for (size_t k = 1; k <= 20; k += 5) { 6693 GemmMicrokernelTester() 6694 .mr(8) 6695 .nr(8) 6696 .kr(1) 6697 .sr(1) 6698 .m(8) 6699 .n(n) 6700 .k(k) 6701 .a_stride(23) 6702 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6703 } 6704 } 6705 } 6706 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_gt_8_subtile)6707 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) { 6708 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6709 for (uint32_t n = 9; n < 16; n++) { 6710 for (size_t k = 1; k <= 20; k += 5) { 6711 for (uint32_t m = 1; m <= 8; m++) { 6712 GemmMicrokernelTester() 6713 .mr(8) 6714 .nr(8) 6715 .kr(1) 
6716 .sr(1) 6717 .m(m) 6718 .n(n) 6719 .k(k) 6720 .iterations(1) 6721 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6722 } 6723 } 6724 } 6725 } 6726 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_div_8)6727 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) { 6728 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6729 for (uint32_t n = 16; n <= 24; n += 8) { 6730 for (size_t k = 1; k <= 20; k += 5) { 6731 GemmMicrokernelTester() 6732 .mr(8) 6733 .nr(8) 6734 .kr(1) 6735 .sr(1) 6736 .m(8) 6737 .n(8) 6738 .k(k) 6739 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6740 } 6741 } 6742 } 6743 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_cn)6744 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) { 6745 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6746 for (uint32_t n = 16; n <= 24; n += 8) { 6747 for (size_t k = 1; k <= 20; k += 5) { 6748 GemmMicrokernelTester() 6749 .mr(8) 6750 .nr(8) 6751 .kr(1) 6752 .sr(1) 6753 .m(8) 6754 .n(n) 6755 .k(k) 6756 .cn_stride(11) 6757 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6758 } 6759 } 6760 } 6761 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_strided_a)6762 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) { 6763 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6764 for (uint32_t n = 16; n <= 24; n += 8) { 6765 for (size_t k = 1; k <= 20; k += 5) { 6766 GemmMicrokernelTester() 6767 .mr(8) 6768 .nr(8) 6769 .kr(1) 6770 .sr(1) 6771 .m(8) 6772 .n(n) 6773 .k(k) 6774 .a_stride(23) 6775 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6776 } 6777 } 6778 } 6779 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,n_div_8_subtile)6780 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) { 6781 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6782 for (uint32_t n = 16; n <= 24; n += 8) { 6783 for (size_t k = 1; k <= 20; k += 5) { 6784 for (uint32_t m = 1; m <= 8; m++) { 6785 
GemmMicrokernelTester() 6786 .mr(8) 6787 .nr(8) 6788 .kr(1) 6789 .sr(1) 6790 .m(m) 6791 .n(n) 6792 .k(k) 6793 .iterations(1) 6794 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6795 } 6796 } 6797 } 6798 } 6799 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)6800 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 6801 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6802 for (size_t k = 1; k <= 20; k += 5) { 6803 for (uint32_t m = 1; m <= 8; m++) { 6804 for (uint32_t n = 1; n <= 8; n++) { 6805 GemmMicrokernelTester() 6806 .mr(8) 6807 .nr(8) 6808 .kr(1) 6809 .sr(1) 6810 .m(m) 6811 .n(n) 6812 .k(k) 6813 .cm_stride(11) 6814 .iterations(1) 6815 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6816 } 6817 } 6818 } 6819 } 6820 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,qmin)6821 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, qmin) { 6822 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6823 GemmMicrokernelTester() 6824 .mr(8) 6825 .nr(8) 6826 .kr(1) 6827 .sr(1) 6828 .m(8) 6829 .n(8) 6830 .k(4) 6831 .qmin(128) 6832 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6833 } 6834 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,qmax)6835 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, qmax) { 6836 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6837 GemmMicrokernelTester() 6838 .mr(8) 6839 .nr(8) 6840 .kr(1) 6841 .sr(1) 6842 .m(8) 6843 .n(8) 6844 .k(4) 6845 .qmax(128) 6846 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6847 } 6848 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64,strided_cm)6849 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 6850 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6851 GemmMicrokernelTester() 6852 .mr(8) 6853 .nr(8) 6854 .kr(1) 6855 .sr(1) 6856 .m(8) 6857 .n(8) 6858 .k(4) 6859 .cm_stride(11) 6860 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64); 6861 } 6862 #endif // XNN_ARCH_ARM64 6863