1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 // 9 // Auto-generated file. Do not edit! 10 // Specification: test/bf16-gemm-minmax.yaml 11 // Generator: tools/generate-gemm-test.py 12 13 14 #include <gtest/gtest.h> 15 16 #include <xnnpack/allocator.h> 17 #include <xnnpack/common.h> 18 #include <xnnpack/isa-checks.h> 19 #include <xnnpack/microparams-init.h> 20 21 #include <xnnpack/gemm.h> 22 #include <xnnpack/igemm.h> 23 #include <xnnpack/ppmm.h> 24 #include "gemm-microkernel-tester.h" 25 26 27 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_eq_8)28 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_eq_8) { 29 TEST_REQUIRES_ARM_NEON_FMA; 30 GemmMicrokernelTester() 31 .mr(1) 32 .nr(4) 33 .kr(8) 34 .sr(1) 35 .m(1) 36 .n(4) 37 .k(8) 38 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 39 } 40 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,strided_cn)41 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, strided_cn) { 42 TEST_REQUIRES_ARM_NEON_FMA; 43 GemmMicrokernelTester() 44 .mr(1) 45 .nr(4) 46 .kr(8) 47 .sr(1) 48 .m(1) 49 .n(4) 50 .k(8) 51 .cn_stride(7) 52 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 53 } 54 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_eq_8_strided_a)55 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_eq_8_strided_a) { 56 TEST_REQUIRES_ARM_NEON_FMA; 57 GemmMicrokernelTester() 58 .mr(1) 59 .nr(4) 60 .kr(8) 61 .sr(1) 62 .m(1) 63 .n(4) 64 .k(8) 65 .a_stride(11) 66 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 67 } 68 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_eq_8_subtile)69 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_eq_8_subtile) { 70 TEST_REQUIRES_ARM_NEON_FMA; 71 for 
(uint32_t n = 1; n <= 4; n++) { 72 for (uint32_t m = 1; m <= 1; m++) { 73 GemmMicrokernelTester() 74 .mr(1) 75 .nr(4) 76 .kr(8) 77 .sr(1) 78 .m(m) 79 .n(n) 80 .k(8) 81 .iterations(1) 82 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 83 } 84 } 85 } 86 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_eq_8_subtile_m)87 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_eq_8_subtile_m) { 88 TEST_REQUIRES_ARM_NEON_FMA; 89 for (uint32_t m = 1; m <= 1; m++) { 90 GemmMicrokernelTester() 91 .mr(1) 92 .nr(4) 93 .kr(8) 94 .sr(1) 95 .m(m) 96 .n(4) 97 .k(8) 98 .iterations(1) 99 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 100 } 101 } 102 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_eq_8_subtile_n)103 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_eq_8_subtile_n) { 104 TEST_REQUIRES_ARM_NEON_FMA; 105 for (uint32_t n = 1; n <= 4; n++) { 106 GemmMicrokernelTester() 107 .mr(1) 108 .nr(4) 109 .kr(8) 110 .sr(1) 111 .m(1) 112 .n(n) 113 .k(8) 114 .iterations(1) 115 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 116 } 117 } 118 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_lt_8)119 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_lt_8) { 120 TEST_REQUIRES_ARM_NEON_FMA; 121 for (size_t k = 1; k < 8; k++) { 122 GemmMicrokernelTester() 123 .mr(1) 124 .nr(4) 125 .kr(8) 126 .sr(1) 127 .m(1) 128 .n(4) 129 .k(k) 130 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 131 } 132 } 133 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_lt_8_strided_a)134 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_lt_8_strided_a) { 135 TEST_REQUIRES_ARM_NEON_FMA; 136 for (size_t k = 1; k < 8; k++) { 137 GemmMicrokernelTester() 138 .mr(1) 139 .nr(4) 140 .kr(8) 141 .sr(1) 142 .m(1) 143 .n(4) 144 .k(k) 145 .a_stride(11) 146 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 147 } 148 } 149 
TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_lt_8_subtile)150 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_lt_8_subtile) { 151 TEST_REQUIRES_ARM_NEON_FMA; 152 for (size_t k = 1; k < 8; k++) { 153 for (uint32_t n = 1; n <= 4; n++) { 154 for (uint32_t m = 1; m <= 1; m++) { 155 GemmMicrokernelTester() 156 .mr(1) 157 .nr(4) 158 .kr(8) 159 .sr(1) 160 .m(m) 161 .n(n) 162 .k(k) 163 .iterations(1) 164 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 165 } 166 } 167 } 168 } 169 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_gt_8)170 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_gt_8) { 171 TEST_REQUIRES_ARM_NEON_FMA; 172 for (size_t k = 9; k < 16; k++) { 173 GemmMicrokernelTester() 174 .mr(1) 175 .nr(4) 176 .kr(8) 177 .sr(1) 178 .m(1) 179 .n(4) 180 .k(k) 181 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 182 } 183 } 184 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_gt_8_strided_a)185 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_gt_8_strided_a) { 186 TEST_REQUIRES_ARM_NEON_FMA; 187 for (size_t k = 9; k < 16; k++) { 188 GemmMicrokernelTester() 189 .mr(1) 190 .nr(4) 191 .kr(8) 192 .sr(1) 193 .m(1) 194 .n(4) 195 .k(k) 196 .a_stride(19) 197 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 198 } 199 } 200 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_gt_8_subtile)201 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_gt_8_subtile) { 202 TEST_REQUIRES_ARM_NEON_FMA; 203 for (size_t k = 9; k < 16; k++) { 204 for (uint32_t n = 1; n <= 4; n++) { 205 for (uint32_t m = 1; m <= 1; m++) { 206 GemmMicrokernelTester() 207 .mr(1) 208 .nr(4) 209 .kr(8) 210 .sr(1) 211 .m(m) 212 .n(n) 213 .k(k) 214 .iterations(1) 215 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 216 } 217 } 218 } 219 } 220 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_div_8)221 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_div_8) { 222 
TEST_REQUIRES_ARM_NEON_FMA; 223 for (size_t k = 16; k <= 80; k += 8) { 224 GemmMicrokernelTester() 225 .mr(1) 226 .nr(4) 227 .kr(8) 228 .sr(1) 229 .m(1) 230 .n(4) 231 .k(k) 232 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 233 } 234 } 235 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_div_8_strided_a)236 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_div_8_strided_a) { 237 TEST_REQUIRES_ARM_NEON_FMA; 238 for (size_t k = 16; k <= 80; k += 8) { 239 GemmMicrokernelTester() 240 .mr(1) 241 .nr(4) 242 .kr(8) 243 .sr(1) 244 .m(1) 245 .n(4) 246 .k(k) 247 .a_stride(83) 248 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 249 } 250 } 251 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,k_div_8_subtile)252 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, k_div_8_subtile) { 253 TEST_REQUIRES_ARM_NEON_FMA; 254 for (size_t k = 16; k <= 80; k += 8) { 255 for (uint32_t n = 1; n <= 4; n++) { 256 for (uint32_t m = 1; m <= 1; m++) { 257 GemmMicrokernelTester() 258 .mr(1) 259 .nr(4) 260 .kr(8) 261 .sr(1) 262 .m(m) 263 .n(n) 264 .k(k) 265 .iterations(1) 266 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 267 } 268 } 269 } 270 } 271 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_gt_4)272 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_gt_4) { 273 TEST_REQUIRES_ARM_NEON_FMA; 274 for (uint32_t n = 5; n < 8; n++) { 275 for (size_t k = 1; k <= 40; k += 9) { 276 GemmMicrokernelTester() 277 .mr(1) 278 .nr(4) 279 .kr(8) 280 .sr(1) 281 .m(1) 282 .n(n) 283 .k(k) 284 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 285 } 286 } 287 } 288 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_gt_4_strided_cn)289 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_gt_4_strided_cn) { 290 TEST_REQUIRES_ARM_NEON_FMA; 291 for (uint32_t n = 5; n < 8; n++) { 292 for (size_t k = 1; k <= 40; k += 9) { 293 GemmMicrokernelTester() 294 .mr(1) 295 
.nr(4) 296 .kr(8) 297 .sr(1) 298 .m(1) 299 .n(n) 300 .k(k) 301 .cn_stride(7) 302 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 303 } 304 } 305 } 306 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_gt_4_strided_a)307 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_gt_4_strided_a) { 308 TEST_REQUIRES_ARM_NEON_FMA; 309 for (uint32_t n = 5; n < 8; n++) { 310 for (size_t k = 1; k <= 40; k += 9) { 311 GemmMicrokernelTester() 312 .mr(1) 313 .nr(4) 314 .kr(8) 315 .sr(1) 316 .m(1) 317 .n(n) 318 .k(k) 319 .a_stride(43) 320 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 321 } 322 } 323 } 324 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_gt_4_subtile)325 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_gt_4_subtile) { 326 TEST_REQUIRES_ARM_NEON_FMA; 327 for (uint32_t n = 5; n < 8; n++) { 328 for (size_t k = 1; k <= 40; k += 9) { 329 for (uint32_t m = 1; m <= 1; m++) { 330 GemmMicrokernelTester() 331 .mr(1) 332 .nr(4) 333 .kr(8) 334 .sr(1) 335 .m(m) 336 .n(n) 337 .k(k) 338 .iterations(1) 339 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 340 } 341 } 342 } 343 } 344 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_div_4)345 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_div_4) { 346 TEST_REQUIRES_ARM_NEON_FMA; 347 for (uint32_t n = 8; n <= 12; n += 4) { 348 for (size_t k = 1; k <= 40; k += 9) { 349 GemmMicrokernelTester() 350 .mr(1) 351 .nr(4) 352 .kr(8) 353 .sr(1) 354 .m(1) 355 .n(n) 356 .k(k) 357 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 358 } 359 } 360 } 361 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_div_4_strided_cn)362 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_div_4_strided_cn) { 363 TEST_REQUIRES_ARM_NEON_FMA; 364 for (uint32_t n = 8; n <= 12; n += 4) { 365 for (size_t k = 1; k <= 40; k += 9) { 366 GemmMicrokernelTester() 367 .mr(1) 368 .nr(4) 369 .kr(8) 370 .sr(1) 371 .m(1) 372 
.n(n) 373 .k(k) 374 .cn_stride(7) 375 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 376 } 377 } 378 } 379 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_div_4_strided_a)380 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_div_4_strided_a) { 381 TEST_REQUIRES_ARM_NEON_FMA; 382 for (uint32_t n = 8; n <= 12; n += 4) { 383 for (size_t k = 1; k <= 40; k += 9) { 384 GemmMicrokernelTester() 385 .mr(1) 386 .nr(4) 387 .kr(8) 388 .sr(1) 389 .m(1) 390 .n(n) 391 .k(k) 392 .a_stride(43) 393 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 394 } 395 } 396 } 397 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,n_div_4_subtile)398 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, n_div_4_subtile) { 399 TEST_REQUIRES_ARM_NEON_FMA; 400 for (uint32_t n = 8; n <= 12; n += 4) { 401 for (size_t k = 1; k <= 40; k += 9) { 402 for (uint32_t m = 1; m <= 1; m++) { 403 GemmMicrokernelTester() 404 .mr(1) 405 .nr(4) 406 .kr(8) 407 .sr(1) 408 .m(m) 409 .n(n) 410 .k(k) 411 .iterations(1) 412 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 413 } 414 } 415 } 416 } 417 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,strided_cm_subtile)418 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, strided_cm_subtile) { 419 TEST_REQUIRES_ARM_NEON_FMA; 420 for (size_t k = 1; k <= 40; k += 9) { 421 for (uint32_t n = 1; n <= 4; n++) { 422 for (uint32_t m = 1; m <= 1; m++) { 423 GemmMicrokernelTester() 424 .mr(1) 425 .nr(4) 426 .kr(8) 427 .sr(1) 428 .m(m) 429 .n(n) 430 .k(k) 431 .cm_stride(7) 432 .iterations(1) 433 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 434 } 435 } 436 } 437 } 438 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,qmin)439 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, qmin) { 440 TEST_REQUIRES_ARM_NEON_FMA; 441 GemmMicrokernelTester() 442 .mr(1) 443 .nr(4) 444 .kr(8) 445 .sr(1) 446 .m(1) 447 .n(4) 448 .k(8) 449 .qmin(128) 450 
.Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 451 } 452 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,qmax)453 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, qmax) { 454 TEST_REQUIRES_ARM_NEON_FMA; 455 GemmMicrokernelTester() 456 .mr(1) 457 .nr(4) 458 .kr(8) 459 .sr(1) 460 .m(1) 461 .n(4) 462 .k(8) 463 .qmax(128) 464 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 465 } 466 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND,strided_cm)467 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_SHLAND, strided_cm) { 468 TEST_REQUIRES_ARM_NEON_FMA; 469 GemmMicrokernelTester() 470 .mr(1) 471 .nr(4) 472 .kr(8) 473 .sr(1) 474 .m(1) 475 .n(4) 476 .k(8) 477 .cm_stride(7) 478 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 479 } 480 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 481 482 483 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_eq_8)484 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_eq_8) { 485 TEST_REQUIRES_ARM_NEON_FMA; 486 GemmMicrokernelTester() 487 .mr(2) 488 .nr(4) 489 .kr(8) 490 .sr(1) 491 .m(2) 492 .n(4) 493 .k(8) 494 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 495 } 496 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,strided_cn)497 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, strided_cn) { 498 TEST_REQUIRES_ARM_NEON_FMA; 499 GemmMicrokernelTester() 500 .mr(2) 501 .nr(4) 502 .kr(8) 503 .sr(1) 504 .m(2) 505 .n(4) 506 .k(8) 507 .cn_stride(7) 508 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 509 } 510 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_eq_8_strided_a)511 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_eq_8_strided_a) { 512 TEST_REQUIRES_ARM_NEON_FMA; 513 GemmMicrokernelTester() 514 .mr(2) 515 .nr(4) 516 .kr(8) 517 .sr(1) 518 .m(2) 519 .n(4) 520 .k(8) 521 .a_stride(11) 522 
.Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 523 } 524 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_eq_8_subtile)525 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_eq_8_subtile) { 526 TEST_REQUIRES_ARM_NEON_FMA; 527 for (uint32_t n = 1; n <= 4; n++) { 528 for (uint32_t m = 1; m <= 2; m++) { 529 GemmMicrokernelTester() 530 .mr(2) 531 .nr(4) 532 .kr(8) 533 .sr(1) 534 .m(m) 535 .n(n) 536 .k(8) 537 .iterations(1) 538 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 539 } 540 } 541 } 542 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_eq_8_subtile_m)543 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_eq_8_subtile_m) { 544 TEST_REQUIRES_ARM_NEON_FMA; 545 for (uint32_t m = 1; m <= 2; m++) { 546 GemmMicrokernelTester() 547 .mr(2) 548 .nr(4) 549 .kr(8) 550 .sr(1) 551 .m(m) 552 .n(4) 553 .k(8) 554 .iterations(1) 555 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 556 } 557 } 558 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_eq_8_subtile_n)559 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_eq_8_subtile_n) { 560 TEST_REQUIRES_ARM_NEON_FMA; 561 for (uint32_t n = 1; n <= 4; n++) { 562 GemmMicrokernelTester() 563 .mr(2) 564 .nr(4) 565 .kr(8) 566 .sr(1) 567 .m(2) 568 .n(n) 569 .k(8) 570 .iterations(1) 571 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 572 } 573 } 574 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_lt_8)575 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_lt_8) { 576 TEST_REQUIRES_ARM_NEON_FMA; 577 for (size_t k = 1; k < 8; k++) { 578 GemmMicrokernelTester() 579 .mr(2) 580 .nr(4) 581 .kr(8) 582 .sr(1) 583 .m(2) 584 .n(4) 585 .k(k) 586 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 587 } 588 } 589 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_lt_8_strided_a)590 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_lt_8_strided_a) { 591 
TEST_REQUIRES_ARM_NEON_FMA; 592 for (size_t k = 1; k < 8; k++) { 593 GemmMicrokernelTester() 594 .mr(2) 595 .nr(4) 596 .kr(8) 597 .sr(1) 598 .m(2) 599 .n(4) 600 .k(k) 601 .a_stride(11) 602 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 603 } 604 } 605 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_lt_8_subtile)606 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_lt_8_subtile) { 607 TEST_REQUIRES_ARM_NEON_FMA; 608 for (size_t k = 1; k < 8; k++) { 609 for (uint32_t n = 1; n <= 4; n++) { 610 for (uint32_t m = 1; m <= 2; m++) { 611 GemmMicrokernelTester() 612 .mr(2) 613 .nr(4) 614 .kr(8) 615 .sr(1) 616 .m(m) 617 .n(n) 618 .k(k) 619 .iterations(1) 620 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 621 } 622 } 623 } 624 } 625 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_gt_8)626 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_gt_8) { 627 TEST_REQUIRES_ARM_NEON_FMA; 628 for (size_t k = 9; k < 16; k++) { 629 GemmMicrokernelTester() 630 .mr(2) 631 .nr(4) 632 .kr(8) 633 .sr(1) 634 .m(2) 635 .n(4) 636 .k(k) 637 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 638 } 639 } 640 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_gt_8_strided_a)641 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_gt_8_strided_a) { 642 TEST_REQUIRES_ARM_NEON_FMA; 643 for (size_t k = 9; k < 16; k++) { 644 GemmMicrokernelTester() 645 .mr(2) 646 .nr(4) 647 .kr(8) 648 .sr(1) 649 .m(2) 650 .n(4) 651 .k(k) 652 .a_stride(19) 653 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 654 } 655 } 656 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_gt_8_subtile)657 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_gt_8_subtile) { 658 TEST_REQUIRES_ARM_NEON_FMA; 659 for (size_t k = 9; k < 16; k++) { 660 for (uint32_t n = 1; n <= 4; n++) { 661 for (uint32_t m = 1; m <= 2; m++) { 662 GemmMicrokernelTester() 663 .mr(2) 664 .nr(4) 665 .kr(8) 666 
.sr(1) 667 .m(m) 668 .n(n) 669 .k(k) 670 .iterations(1) 671 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 672 } 673 } 674 } 675 } 676 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_div_8)677 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_div_8) { 678 TEST_REQUIRES_ARM_NEON_FMA; 679 for (size_t k = 16; k <= 80; k += 8) { 680 GemmMicrokernelTester() 681 .mr(2) 682 .nr(4) 683 .kr(8) 684 .sr(1) 685 .m(2) 686 .n(4) 687 .k(k) 688 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 689 } 690 } 691 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_div_8_strided_a)692 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_div_8_strided_a) { 693 TEST_REQUIRES_ARM_NEON_FMA; 694 for (size_t k = 16; k <= 80; k += 8) { 695 GemmMicrokernelTester() 696 .mr(2) 697 .nr(4) 698 .kr(8) 699 .sr(1) 700 .m(2) 701 .n(4) 702 .k(k) 703 .a_stride(83) 704 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 705 } 706 } 707 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,k_div_8_subtile)708 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, k_div_8_subtile) { 709 TEST_REQUIRES_ARM_NEON_FMA; 710 for (size_t k = 16; k <= 80; k += 8) { 711 for (uint32_t n = 1; n <= 4; n++) { 712 for (uint32_t m = 1; m <= 2; m++) { 713 GemmMicrokernelTester() 714 .mr(2) 715 .nr(4) 716 .kr(8) 717 .sr(1) 718 .m(m) 719 .n(n) 720 .k(k) 721 .iterations(1) 722 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 723 } 724 } 725 } 726 } 727 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_gt_4)728 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_gt_4) { 729 TEST_REQUIRES_ARM_NEON_FMA; 730 for (uint32_t n = 5; n < 8; n++) { 731 for (size_t k = 1; k <= 40; k += 9) { 732 GemmMicrokernelTester() 733 .mr(2) 734 .nr(4) 735 .kr(8) 736 .sr(1) 737 .m(2) 738 .n(n) 739 .k(k) 740 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 741 } 742 } 743 
} 744 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_gt_4_strided_cn)745 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_gt_4_strided_cn) { 746 TEST_REQUIRES_ARM_NEON_FMA; 747 for (uint32_t n = 5; n < 8; n++) { 748 for (size_t k = 1; k <= 40; k += 9) { 749 GemmMicrokernelTester() 750 .mr(2) 751 .nr(4) 752 .kr(8) 753 .sr(1) 754 .m(2) 755 .n(n) 756 .k(k) 757 .cn_stride(7) 758 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 759 } 760 } 761 } 762 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_gt_4_strided_a)763 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_gt_4_strided_a) { 764 TEST_REQUIRES_ARM_NEON_FMA; 765 for (uint32_t n = 5; n < 8; n++) { 766 for (size_t k = 1; k <= 40; k += 9) { 767 GemmMicrokernelTester() 768 .mr(2) 769 .nr(4) 770 .kr(8) 771 .sr(1) 772 .m(2) 773 .n(n) 774 .k(k) 775 .a_stride(43) 776 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 777 } 778 } 779 } 780 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_gt_4_subtile)781 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_gt_4_subtile) { 782 TEST_REQUIRES_ARM_NEON_FMA; 783 for (uint32_t n = 5; n < 8; n++) { 784 for (size_t k = 1; k <= 40; k += 9) { 785 for (uint32_t m = 1; m <= 2; m++) { 786 GemmMicrokernelTester() 787 .mr(2) 788 .nr(4) 789 .kr(8) 790 .sr(1) 791 .m(m) 792 .n(n) 793 .k(k) 794 .iterations(1) 795 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 796 } 797 } 798 } 799 } 800 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_div_4)801 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_div_4) { 802 TEST_REQUIRES_ARM_NEON_FMA; 803 for (uint32_t n = 8; n <= 12; n += 4) { 804 for (size_t k = 1; k <= 40; k += 9) { 805 GemmMicrokernelTester() 806 .mr(2) 807 .nr(4) 808 .kr(8) 809 .sr(1) 810 .m(2) 811 .n(n) 812 .k(k) 813 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 814 } 815 } 816 } 817 
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_div_4_strided_cn)818 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_div_4_strided_cn) { 819 TEST_REQUIRES_ARM_NEON_FMA; 820 for (uint32_t n = 8; n <= 12; n += 4) { 821 for (size_t k = 1; k <= 40; k += 9) { 822 GemmMicrokernelTester() 823 .mr(2) 824 .nr(4) 825 .kr(8) 826 .sr(1) 827 .m(2) 828 .n(n) 829 .k(k) 830 .cn_stride(7) 831 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 832 } 833 } 834 } 835 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_div_4_strided_a)836 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_div_4_strided_a) { 837 TEST_REQUIRES_ARM_NEON_FMA; 838 for (uint32_t n = 8; n <= 12; n += 4) { 839 for (size_t k = 1; k <= 40; k += 9) { 840 GemmMicrokernelTester() 841 .mr(2) 842 .nr(4) 843 .kr(8) 844 .sr(1) 845 .m(2) 846 .n(n) 847 .k(k) 848 .a_stride(43) 849 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 850 } 851 } 852 } 853 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,n_div_4_subtile)854 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, n_div_4_subtile) { 855 TEST_REQUIRES_ARM_NEON_FMA; 856 for (uint32_t n = 8; n <= 12; n += 4) { 857 for (size_t k = 1; k <= 40; k += 9) { 858 for (uint32_t m = 1; m <= 2; m++) { 859 GemmMicrokernelTester() 860 .mr(2) 861 .nr(4) 862 .kr(8) 863 .sr(1) 864 .m(m) 865 .n(n) 866 .k(k) 867 .iterations(1) 868 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 869 } 870 } 871 } 872 } 873 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,strided_cm_subtile)874 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, strided_cm_subtile) { 875 TEST_REQUIRES_ARM_NEON_FMA; 876 for (size_t k = 1; k <= 40; k += 9) { 877 for (uint32_t n = 1; n <= 4; n++) { 878 for (uint32_t m = 1; m <= 2; m++) { 879 GemmMicrokernelTester() 880 .mr(2) 881 .nr(4) 882 .kr(8) 883 .sr(1) 884 .m(m) 885 .n(n) 886 .k(k) 887 .cm_stride(7) 888 .iterations(1) 889 
.Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 890 } 891 } 892 } 893 } 894 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,qmin)895 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, qmin) { 896 TEST_REQUIRES_ARM_NEON_FMA; 897 GemmMicrokernelTester() 898 .mr(2) 899 .nr(4) 900 .kr(8) 901 .sr(1) 902 .m(2) 903 .n(4) 904 .k(8) 905 .qmin(128) 906 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 907 } 908 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,qmax)909 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, qmax) { 910 TEST_REQUIRES_ARM_NEON_FMA; 911 GemmMicrokernelTester() 912 .mr(2) 913 .nr(4) 914 .kr(8) 915 .sr(1) 916 .m(2) 917 .n(4) 918 .k(8) 919 .qmax(128) 920 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 921 } 922 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND,strided_cm)923 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_SHLAND, strided_cm) { 924 TEST_REQUIRES_ARM_NEON_FMA; 925 GemmMicrokernelTester() 926 .mr(2) 927 .nr(4) 928 .kr(8) 929 .sr(1) 930 .m(2) 931 .n(4) 932 .k(8) 933 .cm_stride(7) 934 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 935 } 936 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 937 938 939 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_eq_8)940 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_eq_8) { 941 TEST_REQUIRES_ARM_NEON_FMA; 942 GemmMicrokernelTester() 943 .mr(3) 944 .nr(4) 945 .kr(8) 946 .sr(1) 947 .m(3) 948 .n(4) 949 .k(8) 950 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 951 } 952 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,strided_cn)953 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, strided_cn) { 954 TEST_REQUIRES_ARM_NEON_FMA; 955 GemmMicrokernelTester() 956 .mr(3) 957 .nr(4) 958 .kr(8) 959 .sr(1) 960 .m(3) 961 .n(4) 962 .k(8) 963 .cn_stride(7) 964 
.Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 965 } 966 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_eq_8_strided_a)967 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_eq_8_strided_a) { 968 TEST_REQUIRES_ARM_NEON_FMA; 969 GemmMicrokernelTester() 970 .mr(3) 971 .nr(4) 972 .kr(8) 973 .sr(1) 974 .m(3) 975 .n(4) 976 .k(8) 977 .a_stride(11) 978 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 979 } 980 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_eq_8_subtile)981 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_eq_8_subtile) { 982 TEST_REQUIRES_ARM_NEON_FMA; 983 for (uint32_t n = 1; n <= 4; n++) { 984 for (uint32_t m = 1; m <= 3; m++) { 985 GemmMicrokernelTester() 986 .mr(3) 987 .nr(4) 988 .kr(8) 989 .sr(1) 990 .m(m) 991 .n(n) 992 .k(8) 993 .iterations(1) 994 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 995 } 996 } 997 } 998 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_eq_8_subtile_m)999 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_eq_8_subtile_m) { 1000 TEST_REQUIRES_ARM_NEON_FMA; 1001 for (uint32_t m = 1; m <= 3; m++) { 1002 GemmMicrokernelTester() 1003 .mr(3) 1004 .nr(4) 1005 .kr(8) 1006 .sr(1) 1007 .m(m) 1008 .n(4) 1009 .k(8) 1010 .iterations(1) 1011 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1012 } 1013 } 1014 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_eq_8_subtile_n)1015 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_eq_8_subtile_n) { 1016 TEST_REQUIRES_ARM_NEON_FMA; 1017 for (uint32_t n = 1; n <= 4; n++) { 1018 GemmMicrokernelTester() 1019 .mr(3) 1020 .nr(4) 1021 .kr(8) 1022 .sr(1) 1023 .m(3) 1024 .n(n) 1025 .k(8) 1026 .iterations(1) 1027 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1028 } 1029 } 1030 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_lt_8)1031 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_lt_8) { 1032 
TEST_REQUIRES_ARM_NEON_FMA; 1033 for (size_t k = 1; k < 8; k++) { 1034 GemmMicrokernelTester() 1035 .mr(3) 1036 .nr(4) 1037 .kr(8) 1038 .sr(1) 1039 .m(3) 1040 .n(4) 1041 .k(k) 1042 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1043 } 1044 } 1045 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_lt_8_strided_a)1046 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_lt_8_strided_a) { 1047 TEST_REQUIRES_ARM_NEON_FMA; 1048 for (size_t k = 1; k < 8; k++) { 1049 GemmMicrokernelTester() 1050 .mr(3) 1051 .nr(4) 1052 .kr(8) 1053 .sr(1) 1054 .m(3) 1055 .n(4) 1056 .k(k) 1057 .a_stride(11) 1058 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1059 } 1060 } 1061 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_lt_8_subtile)1062 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_lt_8_subtile) { 1063 TEST_REQUIRES_ARM_NEON_FMA; 1064 for (size_t k = 1; k < 8; k++) { 1065 for (uint32_t n = 1; n <= 4; n++) { 1066 for (uint32_t m = 1; m <= 3; m++) { 1067 GemmMicrokernelTester() 1068 .mr(3) 1069 .nr(4) 1070 .kr(8) 1071 .sr(1) 1072 .m(m) 1073 .n(n) 1074 .k(k) 1075 .iterations(1) 1076 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1077 } 1078 } 1079 } 1080 } 1081 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_gt_8)1082 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_gt_8) { 1083 TEST_REQUIRES_ARM_NEON_FMA; 1084 for (size_t k = 9; k < 16; k++) { 1085 GemmMicrokernelTester() 1086 .mr(3) 1087 .nr(4) 1088 .kr(8) 1089 .sr(1) 1090 .m(3) 1091 .n(4) 1092 .k(k) 1093 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1094 } 1095 } 1096 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_gt_8_strided_a)1097 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_gt_8_strided_a) { 1098 TEST_REQUIRES_ARM_NEON_FMA; 1099 for (size_t k = 9; k < 16; k++) { 1100 GemmMicrokernelTester() 1101 .mr(3) 1102 .nr(4) 1103 .kr(8) 1104 .sr(1) 1105 .m(3) 1106 
.n(4) 1107 .k(k) 1108 .a_stride(19) 1109 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1110 } 1111 } 1112 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_gt_8_subtile)1113 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_gt_8_subtile) { 1114 TEST_REQUIRES_ARM_NEON_FMA; 1115 for (size_t k = 9; k < 16; k++) { 1116 for (uint32_t n = 1; n <= 4; n++) { 1117 for (uint32_t m = 1; m <= 3; m++) { 1118 GemmMicrokernelTester() 1119 .mr(3) 1120 .nr(4) 1121 .kr(8) 1122 .sr(1) 1123 .m(m) 1124 .n(n) 1125 .k(k) 1126 .iterations(1) 1127 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1128 } 1129 } 1130 } 1131 } 1132 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_div_8)1133 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_div_8) { 1134 TEST_REQUIRES_ARM_NEON_FMA; 1135 for (size_t k = 16; k <= 80; k += 8) { 1136 GemmMicrokernelTester() 1137 .mr(3) 1138 .nr(4) 1139 .kr(8) 1140 .sr(1) 1141 .m(3) 1142 .n(4) 1143 .k(k) 1144 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1145 } 1146 } 1147 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_div_8_strided_a)1148 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_div_8_strided_a) { 1149 TEST_REQUIRES_ARM_NEON_FMA; 1150 for (size_t k = 16; k <= 80; k += 8) { 1151 GemmMicrokernelTester() 1152 .mr(3) 1153 .nr(4) 1154 .kr(8) 1155 .sr(1) 1156 .m(3) 1157 .n(4) 1158 .k(k) 1159 .a_stride(83) 1160 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1161 } 1162 } 1163 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,k_div_8_subtile)1164 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, k_div_8_subtile) { 1165 TEST_REQUIRES_ARM_NEON_FMA; 1166 for (size_t k = 16; k <= 80; k += 8) { 1167 for (uint32_t n = 1; n <= 4; n++) { 1168 for (uint32_t m = 1; m <= 3; m++) { 1169 GemmMicrokernelTester() 1170 .mr(3) 1171 .nr(4) 1172 .kr(8) 1173 .sr(1) 1174 .m(m) 1175 .n(n) 1176 .k(k) 1177 .iterations(1) 1178 
.Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1179 } 1180 } 1181 } 1182 } 1183 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_gt_4)1184 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_gt_4) { 1185 TEST_REQUIRES_ARM_NEON_FMA; 1186 for (uint32_t n = 5; n < 8; n++) { 1187 for (size_t k = 1; k <= 40; k += 9) { 1188 GemmMicrokernelTester() 1189 .mr(3) 1190 .nr(4) 1191 .kr(8) 1192 .sr(1) 1193 .m(3) 1194 .n(n) 1195 .k(k) 1196 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1197 } 1198 } 1199 } 1200 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_gt_4_strided_cn)1201 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_gt_4_strided_cn) { 1202 TEST_REQUIRES_ARM_NEON_FMA; 1203 for (uint32_t n = 5; n < 8; n++) { 1204 for (size_t k = 1; k <= 40; k += 9) { 1205 GemmMicrokernelTester() 1206 .mr(3) 1207 .nr(4) 1208 .kr(8) 1209 .sr(1) 1210 .m(3) 1211 .n(n) 1212 .k(k) 1213 .cn_stride(7) 1214 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1215 } 1216 } 1217 } 1218 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_gt_4_strided_a)1219 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_gt_4_strided_a) { 1220 TEST_REQUIRES_ARM_NEON_FMA; 1221 for (uint32_t n = 5; n < 8; n++) { 1222 for (size_t k = 1; k <= 40; k += 9) { 1223 GemmMicrokernelTester() 1224 .mr(3) 1225 .nr(4) 1226 .kr(8) 1227 .sr(1) 1228 .m(3) 1229 .n(n) 1230 .k(k) 1231 .a_stride(43) 1232 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1233 } 1234 } 1235 } 1236 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_gt_4_subtile)1237 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_gt_4_subtile) { 1238 TEST_REQUIRES_ARM_NEON_FMA; 1239 for (uint32_t n = 5; n < 8; n++) { 1240 for (size_t k = 1; k <= 40; k += 9) { 1241 for (uint32_t m = 1; m <= 3; m++) { 1242 GemmMicrokernelTester() 1243 .mr(3) 1244 .nr(4) 1245 .kr(8) 1246 .sr(1) 1247 .m(m) 1248 .n(n) 1249 .k(k) 1250 
.iterations(1) 1251 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1252 } 1253 } 1254 } 1255 } 1256 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_div_4)1257 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_div_4) { 1258 TEST_REQUIRES_ARM_NEON_FMA; 1259 for (uint32_t n = 8; n <= 12; n += 4) { 1260 for (size_t k = 1; k <= 40; k += 9) { 1261 GemmMicrokernelTester() 1262 .mr(3) 1263 .nr(4) 1264 .kr(8) 1265 .sr(1) 1266 .m(3) 1267 .n(n) 1268 .k(k) 1269 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1270 } 1271 } 1272 } 1273 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_div_4_strided_cn)1274 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_div_4_strided_cn) { 1275 TEST_REQUIRES_ARM_NEON_FMA; 1276 for (uint32_t n = 8; n <= 12; n += 4) { 1277 for (size_t k = 1; k <= 40; k += 9) { 1278 GemmMicrokernelTester() 1279 .mr(3) 1280 .nr(4) 1281 .kr(8) 1282 .sr(1) 1283 .m(3) 1284 .n(n) 1285 .k(k) 1286 .cn_stride(7) 1287 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1288 } 1289 } 1290 } 1291 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_div_4_strided_a)1292 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_div_4_strided_a) { 1293 TEST_REQUIRES_ARM_NEON_FMA; 1294 for (uint32_t n = 8; n <= 12; n += 4) { 1295 for (size_t k = 1; k <= 40; k += 9) { 1296 GemmMicrokernelTester() 1297 .mr(3) 1298 .nr(4) 1299 .kr(8) 1300 .sr(1) 1301 .m(3) 1302 .n(n) 1303 .k(k) 1304 .a_stride(43) 1305 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1306 } 1307 } 1308 } 1309 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,n_div_4_subtile)1310 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, n_div_4_subtile) { 1311 TEST_REQUIRES_ARM_NEON_FMA; 1312 for (uint32_t n = 8; n <= 12; n += 4) { 1313 for (size_t k = 1; k <= 40; k += 9) { 1314 for (uint32_t m = 1; m <= 3; m++) { 1315 GemmMicrokernelTester() 1316 .mr(3) 1317 .nr(4) 1318 .kr(8) 1319 
.sr(1) 1320 .m(m) 1321 .n(n) 1322 .k(k) 1323 .iterations(1) 1324 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1325 } 1326 } 1327 } 1328 } 1329 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,strided_cm_subtile)1330 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, strided_cm_subtile) { 1331 TEST_REQUIRES_ARM_NEON_FMA; 1332 for (size_t k = 1; k <= 40; k += 9) { 1333 for (uint32_t n = 1; n <= 4; n++) { 1334 for (uint32_t m = 1; m <= 3; m++) { 1335 GemmMicrokernelTester() 1336 .mr(3) 1337 .nr(4) 1338 .kr(8) 1339 .sr(1) 1340 .m(m) 1341 .n(n) 1342 .k(k) 1343 .cm_stride(7) 1344 .iterations(1) 1345 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1346 } 1347 } 1348 } 1349 } 1350 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,qmin)1351 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, qmin) { 1352 TEST_REQUIRES_ARM_NEON_FMA; 1353 GemmMicrokernelTester() 1354 .mr(3) 1355 .nr(4) 1356 .kr(8) 1357 .sr(1) 1358 .m(3) 1359 .n(4) 1360 .k(8) 1361 .qmin(128) 1362 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1363 } 1364 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,qmax)1365 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, qmax) { 1366 TEST_REQUIRES_ARM_NEON_FMA; 1367 GemmMicrokernelTester() 1368 .mr(3) 1369 .nr(4) 1370 .kr(8) 1371 .sr(1) 1372 .m(3) 1373 .n(4) 1374 .k(8) 1375 .qmax(128) 1376 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1377 } 1378 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND,strided_cm)1379 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_SHLAND, strided_cm) { 1380 TEST_REQUIRES_ARM_NEON_FMA; 1381 GemmMicrokernelTester() 1382 .mr(3) 1383 .nr(4) 1384 .kr(8) 1385 .sr(1) 1386 .m(3) 1387 .n(4) 1388 .k(8) 1389 .cm_stride(7) 1390 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1391 } 1392 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1393 1394 1395 #if XNN_ARCH_ARM || 
XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_eq_8)1396 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_eq_8) { 1397 TEST_REQUIRES_ARM_NEON_FMA; 1398 GemmMicrokernelTester() 1399 .mr(4) 1400 .nr(4) 1401 .kr(8) 1402 .sr(1) 1403 .m(4) 1404 .n(4) 1405 .k(8) 1406 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1407 } 1408 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,strided_cn)1409 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, strided_cn) { 1410 TEST_REQUIRES_ARM_NEON_FMA; 1411 GemmMicrokernelTester() 1412 .mr(4) 1413 .nr(4) 1414 .kr(8) 1415 .sr(1) 1416 .m(4) 1417 .n(4) 1418 .k(8) 1419 .cn_stride(7) 1420 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1421 } 1422 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_eq_8_strided_a)1423 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_eq_8_strided_a) { 1424 TEST_REQUIRES_ARM_NEON_FMA; 1425 GemmMicrokernelTester() 1426 .mr(4) 1427 .nr(4) 1428 .kr(8) 1429 .sr(1) 1430 .m(4) 1431 .n(4) 1432 .k(8) 1433 .a_stride(11) 1434 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1435 } 1436 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_eq_8_subtile)1437 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_eq_8_subtile) { 1438 TEST_REQUIRES_ARM_NEON_FMA; 1439 for (uint32_t n = 1; n <= 4; n++) { 1440 for (uint32_t m = 1; m <= 4; m++) { 1441 GemmMicrokernelTester() 1442 .mr(4) 1443 .nr(4) 1444 .kr(8) 1445 .sr(1) 1446 .m(m) 1447 .n(n) 1448 .k(8) 1449 .iterations(1) 1450 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1451 } 1452 } 1453 } 1454 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_eq_8_subtile_m)1455 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_eq_8_subtile_m) { 1456 TEST_REQUIRES_ARM_NEON_FMA; 1457 for (uint32_t m = 1; m <= 4; m++) { 1458 GemmMicrokernelTester() 1459 .mr(4) 1460 .nr(4) 1461 .kr(8) 1462 .sr(1) 1463 .m(m) 1464 .n(4) 1465 .k(8) 1466 
.iterations(1) 1467 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1468 } 1469 } 1470 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_eq_8_subtile_n)1471 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_eq_8_subtile_n) { 1472 TEST_REQUIRES_ARM_NEON_FMA; 1473 for (uint32_t n = 1; n <= 4; n++) { 1474 GemmMicrokernelTester() 1475 .mr(4) 1476 .nr(4) 1477 .kr(8) 1478 .sr(1) 1479 .m(4) 1480 .n(n) 1481 .k(8) 1482 .iterations(1) 1483 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1484 } 1485 } 1486 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_lt_8)1487 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_lt_8) { 1488 TEST_REQUIRES_ARM_NEON_FMA; 1489 for (size_t k = 1; k < 8; k++) { 1490 GemmMicrokernelTester() 1491 .mr(4) 1492 .nr(4) 1493 .kr(8) 1494 .sr(1) 1495 .m(4) 1496 .n(4) 1497 .k(k) 1498 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1499 } 1500 } 1501 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_lt_8_strided_a)1502 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_lt_8_strided_a) { 1503 TEST_REQUIRES_ARM_NEON_FMA; 1504 for (size_t k = 1; k < 8; k++) { 1505 GemmMicrokernelTester() 1506 .mr(4) 1507 .nr(4) 1508 .kr(8) 1509 .sr(1) 1510 .m(4) 1511 .n(4) 1512 .k(k) 1513 .a_stride(11) 1514 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1515 } 1516 } 1517 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_lt_8_subtile)1518 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_lt_8_subtile) { 1519 TEST_REQUIRES_ARM_NEON_FMA; 1520 for (size_t k = 1; k < 8; k++) { 1521 for (uint32_t n = 1; n <= 4; n++) { 1522 for (uint32_t m = 1; m <= 4; m++) { 1523 GemmMicrokernelTester() 1524 .mr(4) 1525 .nr(4) 1526 .kr(8) 1527 .sr(1) 1528 .m(m) 1529 .n(n) 1530 .k(k) 1531 .iterations(1) 1532 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1533 } 1534 } 1535 } 1536 } 1537 
TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_gt_8)1538 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_gt_8) { 1539 TEST_REQUIRES_ARM_NEON_FMA; 1540 for (size_t k = 9; k < 16; k++) { 1541 GemmMicrokernelTester() 1542 .mr(4) 1543 .nr(4) 1544 .kr(8) 1545 .sr(1) 1546 .m(4) 1547 .n(4) 1548 .k(k) 1549 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1550 } 1551 } 1552 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_gt_8_strided_a)1553 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_gt_8_strided_a) { 1554 TEST_REQUIRES_ARM_NEON_FMA; 1555 for (size_t k = 9; k < 16; k++) { 1556 GemmMicrokernelTester() 1557 .mr(4) 1558 .nr(4) 1559 .kr(8) 1560 .sr(1) 1561 .m(4) 1562 .n(4) 1563 .k(k) 1564 .a_stride(19) 1565 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1566 } 1567 } 1568 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_gt_8_subtile)1569 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_gt_8_subtile) { 1570 TEST_REQUIRES_ARM_NEON_FMA; 1571 for (size_t k = 9; k < 16; k++) { 1572 for (uint32_t n = 1; n <= 4; n++) { 1573 for (uint32_t m = 1; m <= 4; m++) { 1574 GemmMicrokernelTester() 1575 .mr(4) 1576 .nr(4) 1577 .kr(8) 1578 .sr(1) 1579 .m(m) 1580 .n(n) 1581 .k(k) 1582 .iterations(1) 1583 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1584 } 1585 } 1586 } 1587 } 1588 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_div_8)1589 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_div_8) { 1590 TEST_REQUIRES_ARM_NEON_FMA; 1591 for (size_t k = 16; k <= 80; k += 8) { 1592 GemmMicrokernelTester() 1593 .mr(4) 1594 .nr(4) 1595 .kr(8) 1596 .sr(1) 1597 .m(4) 1598 .n(4) 1599 .k(k) 1600 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1601 } 1602 } 1603 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_div_8_strided_a)1604 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_div_8_strided_a) { 1605 TEST_REQUIRES_ARM_NEON_FMA; 1606 
for (size_t k = 16; k <= 80; k += 8) { 1607 GemmMicrokernelTester() 1608 .mr(4) 1609 .nr(4) 1610 .kr(8) 1611 .sr(1) 1612 .m(4) 1613 .n(4) 1614 .k(k) 1615 .a_stride(83) 1616 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1617 } 1618 } 1619 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,k_div_8_subtile)1620 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, k_div_8_subtile) { 1621 TEST_REQUIRES_ARM_NEON_FMA; 1622 for (size_t k = 16; k <= 80; k += 8) { 1623 for (uint32_t n = 1; n <= 4; n++) { 1624 for (uint32_t m = 1; m <= 4; m++) { 1625 GemmMicrokernelTester() 1626 .mr(4) 1627 .nr(4) 1628 .kr(8) 1629 .sr(1) 1630 .m(m) 1631 .n(n) 1632 .k(k) 1633 .iterations(1) 1634 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1635 } 1636 } 1637 } 1638 } 1639 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_gt_4)1640 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_gt_4) { 1641 TEST_REQUIRES_ARM_NEON_FMA; 1642 for (uint32_t n = 5; n < 8; n++) { 1643 for (size_t k = 1; k <= 40; k += 9) { 1644 GemmMicrokernelTester() 1645 .mr(4) 1646 .nr(4) 1647 .kr(8) 1648 .sr(1) 1649 .m(4) 1650 .n(n) 1651 .k(k) 1652 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1653 } 1654 } 1655 } 1656 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_gt_4_strided_cn)1657 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_gt_4_strided_cn) { 1658 TEST_REQUIRES_ARM_NEON_FMA; 1659 for (uint32_t n = 5; n < 8; n++) { 1660 for (size_t k = 1; k <= 40; k += 9) { 1661 GemmMicrokernelTester() 1662 .mr(4) 1663 .nr(4) 1664 .kr(8) 1665 .sr(1) 1666 .m(4) 1667 .n(n) 1668 .k(k) 1669 .cn_stride(7) 1670 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1671 } 1672 } 1673 } 1674 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_gt_4_strided_a)1675 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_gt_4_strided_a) { 1676 TEST_REQUIRES_ARM_NEON_FMA; 1677 for (uint32_t n = 5; n < 
8; n++) { 1678 for (size_t k = 1; k <= 40; k += 9) { 1679 GemmMicrokernelTester() 1680 .mr(4) 1681 .nr(4) 1682 .kr(8) 1683 .sr(1) 1684 .m(4) 1685 .n(n) 1686 .k(k) 1687 .a_stride(43) 1688 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1689 } 1690 } 1691 } 1692 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_gt_4_subtile)1693 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_gt_4_subtile) { 1694 TEST_REQUIRES_ARM_NEON_FMA; 1695 for (uint32_t n = 5; n < 8; n++) { 1696 for (size_t k = 1; k <= 40; k += 9) { 1697 for (uint32_t m = 1; m <= 4; m++) { 1698 GemmMicrokernelTester() 1699 .mr(4) 1700 .nr(4) 1701 .kr(8) 1702 .sr(1) 1703 .m(m) 1704 .n(n) 1705 .k(k) 1706 .iterations(1) 1707 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1708 } 1709 } 1710 } 1711 } 1712 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_div_4)1713 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_div_4) { 1714 TEST_REQUIRES_ARM_NEON_FMA; 1715 for (uint32_t n = 8; n <= 12; n += 4) { 1716 for (size_t k = 1; k <= 40; k += 9) { 1717 GemmMicrokernelTester() 1718 .mr(4) 1719 .nr(4) 1720 .kr(8) 1721 .sr(1) 1722 .m(4) 1723 .n(n) 1724 .k(k) 1725 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1726 } 1727 } 1728 } 1729 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_div_4_strided_cn)1730 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_div_4_strided_cn) { 1731 TEST_REQUIRES_ARM_NEON_FMA; 1732 for (uint32_t n = 8; n <= 12; n += 4) { 1733 for (size_t k = 1; k <= 40; k += 9) { 1734 GemmMicrokernelTester() 1735 .mr(4) 1736 .nr(4) 1737 .kr(8) 1738 .sr(1) 1739 .m(4) 1740 .n(n) 1741 .k(k) 1742 .cn_stride(7) 1743 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1744 } 1745 } 1746 } 1747 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_div_4_strided_a)1748 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_div_4_strided_a) { 1749 
TEST_REQUIRES_ARM_NEON_FMA; 1750 for (uint32_t n = 8; n <= 12; n += 4) { 1751 for (size_t k = 1; k <= 40; k += 9) { 1752 GemmMicrokernelTester() 1753 .mr(4) 1754 .nr(4) 1755 .kr(8) 1756 .sr(1) 1757 .m(4) 1758 .n(n) 1759 .k(k) 1760 .a_stride(43) 1761 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1762 } 1763 } 1764 } 1765 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,n_div_4_subtile)1766 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, n_div_4_subtile) { 1767 TEST_REQUIRES_ARM_NEON_FMA; 1768 for (uint32_t n = 8; n <= 12; n += 4) { 1769 for (size_t k = 1; k <= 40; k += 9) { 1770 for (uint32_t m = 1; m <= 4; m++) { 1771 GemmMicrokernelTester() 1772 .mr(4) 1773 .nr(4) 1774 .kr(8) 1775 .sr(1) 1776 .m(m) 1777 .n(n) 1778 .k(k) 1779 .iterations(1) 1780 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1781 } 1782 } 1783 } 1784 } 1785 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,strided_cm_subtile)1786 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, strided_cm_subtile) { 1787 TEST_REQUIRES_ARM_NEON_FMA; 1788 for (size_t k = 1; k <= 40; k += 9) { 1789 for (uint32_t n = 1; n <= 4; n++) { 1790 for (uint32_t m = 1; m <= 4; m++) { 1791 GemmMicrokernelTester() 1792 .mr(4) 1793 .nr(4) 1794 .kr(8) 1795 .sr(1) 1796 .m(m) 1797 .n(n) 1798 .k(k) 1799 .cm_stride(7) 1800 .iterations(1) 1801 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1802 } 1803 } 1804 } 1805 } 1806 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,qmin)1807 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, qmin) { 1808 TEST_REQUIRES_ARM_NEON_FMA; 1809 GemmMicrokernelTester() 1810 .mr(4) 1811 .nr(4) 1812 .kr(8) 1813 .sr(1) 1814 .m(4) 1815 .n(4) 1816 .k(8) 1817 .qmin(128) 1818 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1819 } 1820 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,qmax)1821 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, qmax) { 1822 
TEST_REQUIRES_ARM_NEON_FMA; 1823 GemmMicrokernelTester() 1824 .mr(4) 1825 .nr(4) 1826 .kr(8) 1827 .sr(1) 1828 .m(4) 1829 .n(4) 1830 .k(8) 1831 .qmax(128) 1832 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1833 } 1834 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND,strided_cm)1835 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_SHLAND, strided_cm) { 1836 TEST_REQUIRES_ARM_NEON_FMA; 1837 GemmMicrokernelTester() 1838 .mr(4) 1839 .nr(4) 1840 .kr(8) 1841 .sr(1) 1842 .m(4) 1843 .n(4) 1844 .k(8) 1845 .cm_stride(7) 1846 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1847 } 1848 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1849 1850 1851 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_eq_8)1852 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_eq_8) { 1853 TEST_REQUIRES_ARM_NEON_FMA; 1854 GemmMicrokernelTester() 1855 .mr(5) 1856 .nr(4) 1857 .kr(8) 1858 .sr(1) 1859 .m(5) 1860 .n(4) 1861 .k(8) 1862 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1863 } 1864 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,strided_cn)1865 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, strided_cn) { 1866 TEST_REQUIRES_ARM_NEON_FMA; 1867 GemmMicrokernelTester() 1868 .mr(5) 1869 .nr(4) 1870 .kr(8) 1871 .sr(1) 1872 .m(5) 1873 .n(4) 1874 .k(8) 1875 .cn_stride(7) 1876 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1877 } 1878 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_eq_8_strided_a)1879 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_eq_8_strided_a) { 1880 TEST_REQUIRES_ARM_NEON_FMA; 1881 GemmMicrokernelTester() 1882 .mr(5) 1883 .nr(4) 1884 .kr(8) 1885 .sr(1) 1886 .m(5) 1887 .n(4) 1888 .k(8) 1889 .a_stride(11) 1890 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1891 } 1892 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_eq_8_subtile)1893 
TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_eq_8_subtile) { 1894 TEST_REQUIRES_ARM_NEON_FMA; 1895 for (uint32_t n = 1; n <= 4; n++) { 1896 for (uint32_t m = 1; m <= 5; m++) { 1897 GemmMicrokernelTester() 1898 .mr(5) 1899 .nr(4) 1900 .kr(8) 1901 .sr(1) 1902 .m(m) 1903 .n(n) 1904 .k(8) 1905 .iterations(1) 1906 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1907 } 1908 } 1909 } 1910 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_eq_8_subtile_m)1911 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_eq_8_subtile_m) { 1912 TEST_REQUIRES_ARM_NEON_FMA; 1913 for (uint32_t m = 1; m <= 5; m++) { 1914 GemmMicrokernelTester() 1915 .mr(5) 1916 .nr(4) 1917 .kr(8) 1918 .sr(1) 1919 .m(m) 1920 .n(4) 1921 .k(8) 1922 .iterations(1) 1923 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1924 } 1925 } 1926 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_eq_8_subtile_n)1927 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_eq_8_subtile_n) { 1928 TEST_REQUIRES_ARM_NEON_FMA; 1929 for (uint32_t n = 1; n <= 4; n++) { 1930 GemmMicrokernelTester() 1931 .mr(5) 1932 .nr(4) 1933 .kr(8) 1934 .sr(1) 1935 .m(5) 1936 .n(n) 1937 .k(8) 1938 .iterations(1) 1939 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1940 } 1941 } 1942 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_lt_8)1943 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_lt_8) { 1944 TEST_REQUIRES_ARM_NEON_FMA; 1945 for (size_t k = 1; k < 8; k++) { 1946 GemmMicrokernelTester() 1947 .mr(5) 1948 .nr(4) 1949 .kr(8) 1950 .sr(1) 1951 .m(5) 1952 .n(4) 1953 .k(k) 1954 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1955 } 1956 } 1957 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_lt_8_strided_a)1958 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_lt_8_strided_a) { 1959 TEST_REQUIRES_ARM_NEON_FMA; 1960 for (size_t k = 1; k < 8; k++) { 1961 GemmMicrokernelTester() 1962 .mr(5) 
1963 .nr(4) 1964 .kr(8) 1965 .sr(1) 1966 .m(5) 1967 .n(4) 1968 .k(k) 1969 .a_stride(11) 1970 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1971 } 1972 } 1973 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_lt_8_subtile)1974 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_lt_8_subtile) { 1975 TEST_REQUIRES_ARM_NEON_FMA; 1976 for (size_t k = 1; k < 8; k++) { 1977 for (uint32_t n = 1; n <= 4; n++) { 1978 for (uint32_t m = 1; m <= 5; m++) { 1979 GemmMicrokernelTester() 1980 .mr(5) 1981 .nr(4) 1982 .kr(8) 1983 .sr(1) 1984 .m(m) 1985 .n(n) 1986 .k(k) 1987 .iterations(1) 1988 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 1989 } 1990 } 1991 } 1992 } 1993 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_gt_8)1994 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_gt_8) { 1995 TEST_REQUIRES_ARM_NEON_FMA; 1996 for (size_t k = 9; k < 16; k++) { 1997 GemmMicrokernelTester() 1998 .mr(5) 1999 .nr(4) 2000 .kr(8) 2001 .sr(1) 2002 .m(5) 2003 .n(4) 2004 .k(k) 2005 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2006 } 2007 } 2008 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_gt_8_strided_a)2009 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_gt_8_strided_a) { 2010 TEST_REQUIRES_ARM_NEON_FMA; 2011 for (size_t k = 9; k < 16; k++) { 2012 GemmMicrokernelTester() 2013 .mr(5) 2014 .nr(4) 2015 .kr(8) 2016 .sr(1) 2017 .m(5) 2018 .n(4) 2019 .k(k) 2020 .a_stride(19) 2021 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2022 } 2023 } 2024 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_gt_8_subtile)2025 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_gt_8_subtile) { 2026 TEST_REQUIRES_ARM_NEON_FMA; 2027 for (size_t k = 9; k < 16; k++) { 2028 for (uint32_t n = 1; n <= 4; n++) { 2029 for (uint32_t m = 1; m <= 5; m++) { 2030 GemmMicrokernelTester() 2031 .mr(5) 2032 .nr(4) 2033 .kr(8) 2034 .sr(1) 2035 .m(m) 2036 .n(n) 2037 
.k(k) 2038 .iterations(1) 2039 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2040 } 2041 } 2042 } 2043 } 2044 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_div_8)2045 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_div_8) { 2046 TEST_REQUIRES_ARM_NEON_FMA; 2047 for (size_t k = 16; k <= 80; k += 8) { 2048 GemmMicrokernelTester() 2049 .mr(5) 2050 .nr(4) 2051 .kr(8) 2052 .sr(1) 2053 .m(5) 2054 .n(4) 2055 .k(k) 2056 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2057 } 2058 } 2059 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_div_8_strided_a)2060 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_div_8_strided_a) { 2061 TEST_REQUIRES_ARM_NEON_FMA; 2062 for (size_t k = 16; k <= 80; k += 8) { 2063 GemmMicrokernelTester() 2064 .mr(5) 2065 .nr(4) 2066 .kr(8) 2067 .sr(1) 2068 .m(5) 2069 .n(4) 2070 .k(k) 2071 .a_stride(83) 2072 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2073 } 2074 } 2075 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,k_div_8_subtile)2076 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, k_div_8_subtile) { 2077 TEST_REQUIRES_ARM_NEON_FMA; 2078 for (size_t k = 16; k <= 80; k += 8) { 2079 for (uint32_t n = 1; n <= 4; n++) { 2080 for (uint32_t m = 1; m <= 5; m++) { 2081 GemmMicrokernelTester() 2082 .mr(5) 2083 .nr(4) 2084 .kr(8) 2085 .sr(1) 2086 .m(m) 2087 .n(n) 2088 .k(k) 2089 .iterations(1) 2090 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2091 } 2092 } 2093 } 2094 } 2095 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_gt_4)2096 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_gt_4) { 2097 TEST_REQUIRES_ARM_NEON_FMA; 2098 for (uint32_t n = 5; n < 8; n++) { 2099 for (size_t k = 1; k <= 40; k += 9) { 2100 GemmMicrokernelTester() 2101 .mr(5) 2102 .nr(4) 2103 .kr(8) 2104 .sr(1) 2105 .m(5) 2106 .n(n) 2107 .k(k) 2108 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, 
xnn_init_bf16_minmax_scalar_params); 2109 } 2110 } 2111 } 2112 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_gt_4_strided_cn)2113 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_gt_4_strided_cn) { 2114 TEST_REQUIRES_ARM_NEON_FMA; 2115 for (uint32_t n = 5; n < 8; n++) { 2116 for (size_t k = 1; k <= 40; k += 9) { 2117 GemmMicrokernelTester() 2118 .mr(5) 2119 .nr(4) 2120 .kr(8) 2121 .sr(1) 2122 .m(5) 2123 .n(n) 2124 .k(k) 2125 .cn_stride(7) 2126 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2127 } 2128 } 2129 } 2130 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_gt_4_strided_a)2131 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_gt_4_strided_a) { 2132 TEST_REQUIRES_ARM_NEON_FMA; 2133 for (uint32_t n = 5; n < 8; n++) { 2134 for (size_t k = 1; k <= 40; k += 9) { 2135 GemmMicrokernelTester() 2136 .mr(5) 2137 .nr(4) 2138 .kr(8) 2139 .sr(1) 2140 .m(5) 2141 .n(n) 2142 .k(k) 2143 .a_stride(43) 2144 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2145 } 2146 } 2147 } 2148 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_gt_4_subtile)2149 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_gt_4_subtile) { 2150 TEST_REQUIRES_ARM_NEON_FMA; 2151 for (uint32_t n = 5; n < 8; n++) { 2152 for (size_t k = 1; k <= 40; k += 9) { 2153 for (uint32_t m = 1; m <= 5; m++) { 2154 GemmMicrokernelTester() 2155 .mr(5) 2156 .nr(4) 2157 .kr(8) 2158 .sr(1) 2159 .m(m) 2160 .n(n) 2161 .k(k) 2162 .iterations(1) 2163 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2164 } 2165 } 2166 } 2167 } 2168 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_div_4)2169 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_div_4) { 2170 TEST_REQUIRES_ARM_NEON_FMA; 2171 for (uint32_t n = 8; n <= 12; n += 4) { 2172 for (size_t k = 1; k <= 40; k += 9) { 2173 GemmMicrokernelTester() 2174 .mr(5) 2175 .nr(4) 2176 .kr(8) 2177 .sr(1) 2178 .m(5) 2179 .n(n) 2180 .k(k) 2181 
.Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2182 } 2183 } 2184 } 2185 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_div_4_strided_cn)2186 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_div_4_strided_cn) { 2187 TEST_REQUIRES_ARM_NEON_FMA; 2188 for (uint32_t n = 8; n <= 12; n += 4) { 2189 for (size_t k = 1; k <= 40; k += 9) { 2190 GemmMicrokernelTester() 2191 .mr(5) 2192 .nr(4) 2193 .kr(8) 2194 .sr(1) 2195 .m(5) 2196 .n(n) 2197 .k(k) 2198 .cn_stride(7) 2199 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2200 } 2201 } 2202 } 2203 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_div_4_strided_a)2204 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_div_4_strided_a) { 2205 TEST_REQUIRES_ARM_NEON_FMA; 2206 for (uint32_t n = 8; n <= 12; n += 4) { 2207 for (size_t k = 1; k <= 40; k += 9) { 2208 GemmMicrokernelTester() 2209 .mr(5) 2210 .nr(4) 2211 .kr(8) 2212 .sr(1) 2213 .m(5) 2214 .n(n) 2215 .k(k) 2216 .a_stride(43) 2217 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2218 } 2219 } 2220 } 2221 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,n_div_4_subtile)2222 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, n_div_4_subtile) { 2223 TEST_REQUIRES_ARM_NEON_FMA; 2224 for (uint32_t n = 8; n <= 12; n += 4) { 2225 for (size_t k = 1; k <= 40; k += 9) { 2226 for (uint32_t m = 1; m <= 5; m++) { 2227 GemmMicrokernelTester() 2228 .mr(5) 2229 .nr(4) 2230 .kr(8) 2231 .sr(1) 2232 .m(m) 2233 .n(n) 2234 .k(k) 2235 .iterations(1) 2236 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2237 } 2238 } 2239 } 2240 } 2241 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,strided_cm_subtile)2242 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, strided_cm_subtile) { 2243 TEST_REQUIRES_ARM_NEON_FMA; 2244 for (size_t k = 1; k <= 40; k += 9) { 2245 for (uint32_t n = 1; n <= 4; n++) { 2246 for (uint32_t m = 1; m <= 5; m++) { 2247 
GemmMicrokernelTester() 2248 .mr(5) 2249 .nr(4) 2250 .kr(8) 2251 .sr(1) 2252 .m(m) 2253 .n(n) 2254 .k(k) 2255 .cm_stride(7) 2256 .iterations(1) 2257 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2258 } 2259 } 2260 } 2261 } 2262 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,qmin)2263 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, qmin) { 2264 TEST_REQUIRES_ARM_NEON_FMA; 2265 GemmMicrokernelTester() 2266 .mr(5) 2267 .nr(4) 2268 .kr(8) 2269 .sr(1) 2270 .m(5) 2271 .n(4) 2272 .k(8) 2273 .qmin(128) 2274 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2275 } 2276 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,qmax)2277 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, qmax) { 2278 TEST_REQUIRES_ARM_NEON_FMA; 2279 GemmMicrokernelTester() 2280 .mr(5) 2281 .nr(4) 2282 .kr(8) 2283 .sr(1) 2284 .m(5) 2285 .n(4) 2286 .k(8) 2287 .qmax(128) 2288 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2289 } 2290 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND,strided_cm)2291 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_SHLAND, strided_cm) { 2292 TEST_REQUIRES_ARM_NEON_FMA; 2293 GemmMicrokernelTester() 2294 .mr(5) 2295 .nr(4) 2296 .kr(8) 2297 .sr(1) 2298 .m(5) 2299 .n(4) 2300 .k(8) 2301 .cm_stride(7) 2302 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_shland, xnn_init_bf16_minmax_scalar_params); 2303 } 2304 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2305 2306 2307 #if XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_eq_8)2308 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_eq_8) { 2309 TEST_REQUIRES_ARM_NEON_FMA; 2310 GemmMicrokernelTester() 2311 .mr(1) 2312 .nr(4) 2313 .kr(8) 2314 .sr(1) 2315 .m(1) 2316 .n(4) 2317 .k(8) 2318 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2319 } 2320 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,strided_cn)2321 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, strided_cn) { 2322 
TEST_REQUIRES_ARM_NEON_FMA; 2323 GemmMicrokernelTester() 2324 .mr(1) 2325 .nr(4) 2326 .kr(8) 2327 .sr(1) 2328 .m(1) 2329 .n(4) 2330 .k(8) 2331 .cn_stride(7) 2332 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2333 } 2334 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_eq_8_strided_a)2335 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_eq_8_strided_a) { 2336 TEST_REQUIRES_ARM_NEON_FMA; 2337 GemmMicrokernelTester() 2338 .mr(1) 2339 .nr(4) 2340 .kr(8) 2341 .sr(1) 2342 .m(1) 2343 .n(4) 2344 .k(8) 2345 .a_stride(11) 2346 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2347 } 2348 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_eq_8_subtile)2349 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_eq_8_subtile) { 2350 TEST_REQUIRES_ARM_NEON_FMA; 2351 for (uint32_t n = 1; n <= 4; n++) { 2352 for (uint32_t m = 1; m <= 1; m++) { 2353 GemmMicrokernelTester() 2354 .mr(1) 2355 .nr(4) 2356 .kr(8) 2357 .sr(1) 2358 .m(m) 2359 .n(n) 2360 .k(8) 2361 .iterations(1) 2362 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2363 } 2364 } 2365 } 2366 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_eq_8_subtile_m)2367 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_eq_8_subtile_m) { 2368 TEST_REQUIRES_ARM_NEON_FMA; 2369 for (uint32_t m = 1; m <= 1; m++) { 2370 GemmMicrokernelTester() 2371 .mr(1) 2372 .nr(4) 2373 .kr(8) 2374 .sr(1) 2375 .m(m) 2376 .n(4) 2377 .k(8) 2378 .iterations(1) 2379 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2380 } 2381 } 2382 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_eq_8_subtile_n)2383 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_eq_8_subtile_n) { 2384 TEST_REQUIRES_ARM_NEON_FMA; 2385 for (uint32_t n = 1; n <= 4; n++) { 2386 GemmMicrokernelTester() 2387 .mr(1) 2388 .nr(4) 2389 .kr(8) 2390 .sr(1) 2391 .m(1) 2392 .n(n) 2393 .k(8) 2394 .iterations(1) 2395 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, 
xnn_init_bf16_minmax_scalar_params); 2396 } 2397 } 2398 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_lt_8)2399 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_lt_8) { 2400 TEST_REQUIRES_ARM_NEON_FMA; 2401 for (size_t k = 1; k < 8; k++) { 2402 GemmMicrokernelTester() 2403 .mr(1) 2404 .nr(4) 2405 .kr(8) 2406 .sr(1) 2407 .m(1) 2408 .n(4) 2409 .k(k) 2410 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2411 } 2412 } 2413 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_lt_8_strided_a)2414 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_lt_8_strided_a) { 2415 TEST_REQUIRES_ARM_NEON_FMA; 2416 for (size_t k = 1; k < 8; k++) { 2417 GemmMicrokernelTester() 2418 .mr(1) 2419 .nr(4) 2420 .kr(8) 2421 .sr(1) 2422 .m(1) 2423 .n(4) 2424 .k(k) 2425 .a_stride(11) 2426 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2427 } 2428 } 2429 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_lt_8_subtile)2430 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_lt_8_subtile) { 2431 TEST_REQUIRES_ARM_NEON_FMA; 2432 for (size_t k = 1; k < 8; k++) { 2433 for (uint32_t n = 1; n <= 4; n++) { 2434 for (uint32_t m = 1; m <= 1; m++) { 2435 GemmMicrokernelTester() 2436 .mr(1) 2437 .nr(4) 2438 .kr(8) 2439 .sr(1) 2440 .m(m) 2441 .n(n) 2442 .k(k) 2443 .iterations(1) 2444 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2445 } 2446 } 2447 } 2448 } 2449 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_gt_8)2450 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_gt_8) { 2451 TEST_REQUIRES_ARM_NEON_FMA; 2452 for (size_t k = 9; k < 16; k++) { 2453 GemmMicrokernelTester() 2454 .mr(1) 2455 .nr(4) 2456 .kr(8) 2457 .sr(1) 2458 .m(1) 2459 .n(4) 2460 .k(k) 2461 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2462 } 2463 } 2464 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_gt_8_strided_a)2465 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_gt_8_strided_a) { 2466 TEST_REQUIRES_ARM_NEON_FMA; 2467 
for (size_t k = 9; k < 16; k++) { 2468 GemmMicrokernelTester() 2469 .mr(1) 2470 .nr(4) 2471 .kr(8) 2472 .sr(1) 2473 .m(1) 2474 .n(4) 2475 .k(k) 2476 .a_stride(19) 2477 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2478 } 2479 } 2480 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_gt_8_subtile)2481 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_gt_8_subtile) { 2482 TEST_REQUIRES_ARM_NEON_FMA; 2483 for (size_t k = 9; k < 16; k++) { 2484 for (uint32_t n = 1; n <= 4; n++) { 2485 for (uint32_t m = 1; m <= 1; m++) { 2486 GemmMicrokernelTester() 2487 .mr(1) 2488 .nr(4) 2489 .kr(8) 2490 .sr(1) 2491 .m(m) 2492 .n(n) 2493 .k(k) 2494 .iterations(1) 2495 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2496 } 2497 } 2498 } 2499 } 2500 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_div_8)2501 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_div_8) { 2502 TEST_REQUIRES_ARM_NEON_FMA; 2503 for (size_t k = 16; k <= 80; k += 8) { 2504 GemmMicrokernelTester() 2505 .mr(1) 2506 .nr(4) 2507 .kr(8) 2508 .sr(1) 2509 .m(1) 2510 .n(4) 2511 .k(k) 2512 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2513 } 2514 } 2515 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_div_8_strided_a)2516 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_div_8_strided_a) { 2517 TEST_REQUIRES_ARM_NEON_FMA; 2518 for (size_t k = 16; k <= 80; k += 8) { 2519 GemmMicrokernelTester() 2520 .mr(1) 2521 .nr(4) 2522 .kr(8) 2523 .sr(1) 2524 .m(1) 2525 .n(4) 2526 .k(k) 2527 .a_stride(83) 2528 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2529 } 2530 } 2531 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,k_div_8_subtile)2532 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, k_div_8_subtile) { 2533 TEST_REQUIRES_ARM_NEON_FMA; 2534 for (size_t k = 16; k <= 80; k += 8) { 2535 for (uint32_t n = 1; n <= 4; n++) { 2536 for (uint32_t m = 1; m <= 1; m++) { 2537 GemmMicrokernelTester() 2538 .mr(1) 
2539 .nr(4) 2540 .kr(8) 2541 .sr(1) 2542 .m(m) 2543 .n(n) 2544 .k(k) 2545 .iterations(1) 2546 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2547 } 2548 } 2549 } 2550 } 2551 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_gt_4)2552 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_gt_4) { 2553 TEST_REQUIRES_ARM_NEON_FMA; 2554 for (uint32_t n = 5; n < 8; n++) { 2555 for (size_t k = 1; k <= 40; k += 9) { 2556 GemmMicrokernelTester() 2557 .mr(1) 2558 .nr(4) 2559 .kr(8) 2560 .sr(1) 2561 .m(1) 2562 .n(n) 2563 .k(k) 2564 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2565 } 2566 } 2567 } 2568 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_gt_4_strided_cn)2569 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_gt_4_strided_cn) { 2570 TEST_REQUIRES_ARM_NEON_FMA; 2571 for (uint32_t n = 5; n < 8; n++) { 2572 for (size_t k = 1; k <= 40; k += 9) { 2573 GemmMicrokernelTester() 2574 .mr(1) 2575 .nr(4) 2576 .kr(8) 2577 .sr(1) 2578 .m(1) 2579 .n(n) 2580 .k(k) 2581 .cn_stride(7) 2582 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2583 } 2584 } 2585 } 2586 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_gt_4_strided_a)2587 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_gt_4_strided_a) { 2588 TEST_REQUIRES_ARM_NEON_FMA; 2589 for (uint32_t n = 5; n < 8; n++) { 2590 for (size_t k = 1; k <= 40; k += 9) { 2591 GemmMicrokernelTester() 2592 .mr(1) 2593 .nr(4) 2594 .kr(8) 2595 .sr(1) 2596 .m(1) 2597 .n(n) 2598 .k(k) 2599 .a_stride(43) 2600 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2601 } 2602 } 2603 } 2604 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_gt_4_subtile)2605 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_gt_4_subtile) { 2606 TEST_REQUIRES_ARM_NEON_FMA; 2607 for (uint32_t n = 5; n < 8; n++) { 2608 for (size_t k = 1; k <= 40; k += 9) { 2609 for (uint32_t m = 1; m <= 1; m++) { 2610 GemmMicrokernelTester() 2611 .mr(1) 2612 .nr(4) 2613 
.kr(8) 2614 .sr(1) 2615 .m(m) 2616 .n(n) 2617 .k(k) 2618 .iterations(1) 2619 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2620 } 2621 } 2622 } 2623 } 2624 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_div_4)2625 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_div_4) { 2626 TEST_REQUIRES_ARM_NEON_FMA; 2627 for (uint32_t n = 8; n <= 12; n += 4) { 2628 for (size_t k = 1; k <= 40; k += 9) { 2629 GemmMicrokernelTester() 2630 .mr(1) 2631 .nr(4) 2632 .kr(8) 2633 .sr(1) 2634 .m(1) 2635 .n(n) 2636 .k(k) 2637 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2638 } 2639 } 2640 } 2641 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_div_4_strided_cn)2642 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_div_4_strided_cn) { 2643 TEST_REQUIRES_ARM_NEON_FMA; 2644 for (uint32_t n = 8; n <= 12; n += 4) { 2645 for (size_t k = 1; k <= 40; k += 9) { 2646 GemmMicrokernelTester() 2647 .mr(1) 2648 .nr(4) 2649 .kr(8) 2650 .sr(1) 2651 .m(1) 2652 .n(n) 2653 .k(k) 2654 .cn_stride(7) 2655 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2656 } 2657 } 2658 } 2659 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_div_4_strided_a)2660 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_div_4_strided_a) { 2661 TEST_REQUIRES_ARM_NEON_FMA; 2662 for (uint32_t n = 8; n <= 12; n += 4) { 2663 for (size_t k = 1; k <= 40; k += 9) { 2664 GemmMicrokernelTester() 2665 .mr(1) 2666 .nr(4) 2667 .kr(8) 2668 .sr(1) 2669 .m(1) 2670 .n(n) 2671 .k(k) 2672 .a_stride(43) 2673 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2674 } 2675 } 2676 } 2677 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,n_div_4_subtile)2678 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, n_div_4_subtile) { 2679 TEST_REQUIRES_ARM_NEON_FMA; 2680 for (uint32_t n = 8; n <= 12; n += 4) { 2681 for (size_t k = 1; k <= 40; k += 9) { 2682 for (uint32_t m = 1; m <= 1; m++) { 2683 GemmMicrokernelTester() 2684 .mr(1) 2685 
.nr(4) 2686 .kr(8) 2687 .sr(1) 2688 .m(m) 2689 .n(n) 2690 .k(k) 2691 .iterations(1) 2692 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2693 } 2694 } 2695 } 2696 } 2697 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,strided_cm_subtile)2698 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, strided_cm_subtile) { 2699 TEST_REQUIRES_ARM_NEON_FMA; 2700 for (size_t k = 1; k <= 40; k += 9) { 2701 for (uint32_t n = 1; n <= 4; n++) { 2702 for (uint32_t m = 1; m <= 1; m++) { 2703 GemmMicrokernelTester() 2704 .mr(1) 2705 .nr(4) 2706 .kr(8) 2707 .sr(1) 2708 .m(m) 2709 .n(n) 2710 .k(k) 2711 .cm_stride(7) 2712 .iterations(1) 2713 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2714 } 2715 } 2716 } 2717 } 2718 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,qmin)2719 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, qmin) { 2720 TEST_REQUIRES_ARM_NEON_FMA; 2721 GemmMicrokernelTester() 2722 .mr(1) 2723 .nr(4) 2724 .kr(8) 2725 .sr(1) 2726 .m(1) 2727 .n(4) 2728 .k(8) 2729 .qmin(128) 2730 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2731 } 2732 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,qmax)2733 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, qmax) { 2734 TEST_REQUIRES_ARM_NEON_FMA; 2735 GemmMicrokernelTester() 2736 .mr(1) 2737 .nr(4) 2738 .kr(8) 2739 .sr(1) 2740 .m(1) 2741 .n(4) 2742 .k(8) 2743 .qmax(128) 2744 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2745 } 2746 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP,strided_cm)2747 TEST(BF16_GEMM_MINMAX_1X4C8__NEONFMA_ZIP, strided_cm) { 2748 TEST_REQUIRES_ARM_NEON_FMA; 2749 GemmMicrokernelTester() 2750 .mr(1) 2751 .nr(4) 2752 .kr(8) 2753 .sr(1) 2754 .m(1) 2755 .n(4) 2756 .k(8) 2757 .cm_stride(7) 2758 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2759 } 2760 #endif // XNN_ARCH_ARM64 2761 2762 2763 #if XNN_ARCH_ARM64 
// Full 2x4 tile with k = 8.
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_eq_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(8)
      .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
            xnn_init_bf16_minmax_scalar_params);
}

// Full 2x4 tile, k = 8, cn_stride = 7.
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, strided_cn) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(8).cn_stride(7)
      .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
            xnn_init_bf16_minmax_scalar_params);
}

// Full 2x4 tile, k = 8, a_stride = 11.
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  GemmMicrokernelTester()
      .mr(2).nr(4).kr(8).sr(1)
      .m(2).n(4).k(8).a_stride(11)
      .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
            xnn_init_bf16_minmax_scalar_params);
}

// k = 8 over every m <= 2, n <= 4 (one iteration each).
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 4; ++n) {
    for (uint32_t m = 1; m <= 2; ++m) {
      GemmMicrokernelTester()
          .mr(2).nr(4).kr(8).sr(1)
          .m(m).n(n).k(8).iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
                xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// k = 8, n = 4, sweeping m = 1..2 (one iteration each).
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t m = 1; m <= 2; ++m) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(m).n(4).k(8).iterations(1)
        .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
              xnn_init_bf16_minmax_scalar_params);
  }
}

// k = 8, m = 2, sweeping n = 1..4 (one iteration each).
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (uint32_t n = 1; n <= 4; ++n) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(n).k(8).iterations(1)
        .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
              xnn_init_bf16_minmax_scalar_params);
  }
}

// Full 2x4 tile for k = 1..7.
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k)
        .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
              xnn_init_bf16_minmax_scalar_params);
  }
}

// Full 2x4 tile for k = 1..7 with a_stride = 11.
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
        .mr(2).nr(4).kr(8).sr(1)
        .m(2).n(4).k(k).a_stride(11)
        .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
              xnn_init_bf16_minmax_scalar_params);
  }
}

// k = 1..7 over every m <= 2, n <= 4 (one iteration each).
TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FMA;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 4; ++n) {
      for (uint32_t m = 1; m <= 2; ++m) {
        GemmMicrokernelTester()
            .mr(2).nr(4).kr(8).sr(1)
            .m(m).n(n).k(k).iterations(1)
            .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip,
                  xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_gt_8) { 2907 TEST_REQUIRES_ARM_NEON_FMA; 2908 for (size_t k = 9; k < 16; k++) { 2909 GemmMicrokernelTester() 2910 .mr(2) 2911 .nr(4) 2912 .kr(8) 2913 .sr(1) 2914 .m(2) 2915 .n(4) 2916 .k(k) 2917 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2918 } 2919 } 2920 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,k_gt_8_strided_a)2921 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_gt_8_strided_a) { 2922 TEST_REQUIRES_ARM_NEON_FMA; 2923 for (size_t k = 9; k < 16; k++) { 2924 GemmMicrokernelTester() 2925 .mr(2) 2926 .nr(4) 2927 .kr(8) 2928 .sr(1) 2929 .m(2) 2930 .n(4) 2931 .k(k) 2932 .a_stride(19) 2933 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2934 } 2935 } 2936 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,k_gt_8_subtile)2937 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_gt_8_subtile) { 2938 TEST_REQUIRES_ARM_NEON_FMA; 2939 for (size_t k = 9; k < 16; k++) { 2940 for (uint32_t n = 1; n <= 4; n++) { 2941 for (uint32_t m = 1; m <= 2; m++) { 2942 GemmMicrokernelTester() 2943 .mr(2) 2944 .nr(4) 2945 .kr(8) 2946 .sr(1) 2947 .m(m) 2948 .n(n) 2949 .k(k) 2950 .iterations(1) 2951 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2952 } 2953 } 2954 } 2955 } 2956 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,k_div_8)2957 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_div_8) { 2958 TEST_REQUIRES_ARM_NEON_FMA; 2959 for (size_t k = 16; k <= 80; k += 8) { 2960 GemmMicrokernelTester() 2961 .mr(2) 2962 .nr(4) 2963 .kr(8) 2964 .sr(1) 2965 .m(2) 2966 .n(4) 2967 .k(k) 2968 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2969 } 2970 } 2971 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,k_div_8_strided_a)2972 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_div_8_strided_a) { 2973 TEST_REQUIRES_ARM_NEON_FMA; 2974 for (size_t k = 16; k <= 80; k += 8) { 2975 GemmMicrokernelTester() 2976 .mr(2) 2977 .nr(4) 2978 
.kr(8) 2979 .sr(1) 2980 .m(2) 2981 .n(4) 2982 .k(k) 2983 .a_stride(83) 2984 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 2985 } 2986 } 2987 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,k_div_8_subtile)2988 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, k_div_8_subtile) { 2989 TEST_REQUIRES_ARM_NEON_FMA; 2990 for (size_t k = 16; k <= 80; k += 8) { 2991 for (uint32_t n = 1; n <= 4; n++) { 2992 for (uint32_t m = 1; m <= 2; m++) { 2993 GemmMicrokernelTester() 2994 .mr(2) 2995 .nr(4) 2996 .kr(8) 2997 .sr(1) 2998 .m(m) 2999 .n(n) 3000 .k(k) 3001 .iterations(1) 3002 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3003 } 3004 } 3005 } 3006 } 3007 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_gt_4)3008 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_gt_4) { 3009 TEST_REQUIRES_ARM_NEON_FMA; 3010 for (uint32_t n = 5; n < 8; n++) { 3011 for (size_t k = 1; k <= 40; k += 9) { 3012 GemmMicrokernelTester() 3013 .mr(2) 3014 .nr(4) 3015 .kr(8) 3016 .sr(1) 3017 .m(2) 3018 .n(n) 3019 .k(k) 3020 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3021 } 3022 } 3023 } 3024 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_gt_4_strided_cn)3025 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_gt_4_strided_cn) { 3026 TEST_REQUIRES_ARM_NEON_FMA; 3027 for (uint32_t n = 5; n < 8; n++) { 3028 for (size_t k = 1; k <= 40; k += 9) { 3029 GemmMicrokernelTester() 3030 .mr(2) 3031 .nr(4) 3032 .kr(8) 3033 .sr(1) 3034 .m(2) 3035 .n(n) 3036 .k(k) 3037 .cn_stride(7) 3038 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3039 } 3040 } 3041 } 3042 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_gt_4_strided_a)3043 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_gt_4_strided_a) { 3044 TEST_REQUIRES_ARM_NEON_FMA; 3045 for (uint32_t n = 5; n < 8; n++) { 3046 for (size_t k = 1; k <= 40; k += 9) { 3047 GemmMicrokernelTester() 3048 .mr(2) 3049 .nr(4) 3050 .kr(8) 3051 .sr(1) 
3052 .m(2) 3053 .n(n) 3054 .k(k) 3055 .a_stride(43) 3056 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3057 } 3058 } 3059 } 3060 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_gt_4_subtile)3061 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_gt_4_subtile) { 3062 TEST_REQUIRES_ARM_NEON_FMA; 3063 for (uint32_t n = 5; n < 8; n++) { 3064 for (size_t k = 1; k <= 40; k += 9) { 3065 for (uint32_t m = 1; m <= 2; m++) { 3066 GemmMicrokernelTester() 3067 .mr(2) 3068 .nr(4) 3069 .kr(8) 3070 .sr(1) 3071 .m(m) 3072 .n(n) 3073 .k(k) 3074 .iterations(1) 3075 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3076 } 3077 } 3078 } 3079 } 3080 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_div_4)3081 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_div_4) { 3082 TEST_REQUIRES_ARM_NEON_FMA; 3083 for (uint32_t n = 8; n <= 12; n += 4) { 3084 for (size_t k = 1; k <= 40; k += 9) { 3085 GemmMicrokernelTester() 3086 .mr(2) 3087 .nr(4) 3088 .kr(8) 3089 .sr(1) 3090 .m(2) 3091 .n(n) 3092 .k(k) 3093 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3094 } 3095 } 3096 } 3097 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_div_4_strided_cn)3098 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_div_4_strided_cn) { 3099 TEST_REQUIRES_ARM_NEON_FMA; 3100 for (uint32_t n = 8; n <= 12; n += 4) { 3101 for (size_t k = 1; k <= 40; k += 9) { 3102 GemmMicrokernelTester() 3103 .mr(2) 3104 .nr(4) 3105 .kr(8) 3106 .sr(1) 3107 .m(2) 3108 .n(n) 3109 .k(k) 3110 .cn_stride(7) 3111 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3112 } 3113 } 3114 } 3115 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_div_4_strided_a)3116 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_div_4_strided_a) { 3117 TEST_REQUIRES_ARM_NEON_FMA; 3118 for (uint32_t n = 8; n <= 12; n += 4) { 3119 for (size_t k = 1; k <= 40; k += 9) { 3120 GemmMicrokernelTester() 3121 .mr(2) 3122 .nr(4) 3123 .kr(8) 3124 
.sr(1) 3125 .m(2) 3126 .n(n) 3127 .k(k) 3128 .a_stride(43) 3129 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3130 } 3131 } 3132 } 3133 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,n_div_4_subtile)3134 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, n_div_4_subtile) { 3135 TEST_REQUIRES_ARM_NEON_FMA; 3136 for (uint32_t n = 8; n <= 12; n += 4) { 3137 for (size_t k = 1; k <= 40; k += 9) { 3138 for (uint32_t m = 1; m <= 2; m++) { 3139 GemmMicrokernelTester() 3140 .mr(2) 3141 .nr(4) 3142 .kr(8) 3143 .sr(1) 3144 .m(m) 3145 .n(n) 3146 .k(k) 3147 .iterations(1) 3148 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3149 } 3150 } 3151 } 3152 } 3153 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,strided_cm_subtile)3154 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, strided_cm_subtile) { 3155 TEST_REQUIRES_ARM_NEON_FMA; 3156 for (size_t k = 1; k <= 40; k += 9) { 3157 for (uint32_t n = 1; n <= 4; n++) { 3158 for (uint32_t m = 1; m <= 2; m++) { 3159 GemmMicrokernelTester() 3160 .mr(2) 3161 .nr(4) 3162 .kr(8) 3163 .sr(1) 3164 .m(m) 3165 .n(n) 3166 .k(k) 3167 .cm_stride(7) 3168 .iterations(1) 3169 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3170 } 3171 } 3172 } 3173 } 3174 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,qmin)3175 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, qmin) { 3176 TEST_REQUIRES_ARM_NEON_FMA; 3177 GemmMicrokernelTester() 3178 .mr(2) 3179 .nr(4) 3180 .kr(8) 3181 .sr(1) 3182 .m(2) 3183 .n(4) 3184 .k(8) 3185 .qmin(128) 3186 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3187 } 3188 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,qmax)3189 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, qmax) { 3190 TEST_REQUIRES_ARM_NEON_FMA; 3191 GemmMicrokernelTester() 3192 .mr(2) 3193 .nr(4) 3194 .kr(8) 3195 .sr(1) 3196 .m(2) 3197 .n(4) 3198 .k(8) 3199 .qmax(128) 3200 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, 
xnn_init_bf16_minmax_scalar_params); 3201 } 3202 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP,strided_cm)3203 TEST(BF16_GEMM_MINMAX_2X4C8__NEONFMA_ZIP, strided_cm) { 3204 TEST_REQUIRES_ARM_NEON_FMA; 3205 GemmMicrokernelTester() 3206 .mr(2) 3207 .nr(4) 3208 .kr(8) 3209 .sr(1) 3210 .m(2) 3211 .n(4) 3212 .k(8) 3213 .cm_stride(7) 3214 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3215 } 3216 #endif // XNN_ARCH_ARM64 3217 3218 3219 #if XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_eq_8)3220 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_eq_8) { 3221 TEST_REQUIRES_ARM_NEON_FMA; 3222 GemmMicrokernelTester() 3223 .mr(3) 3224 .nr(4) 3225 .kr(8) 3226 .sr(1) 3227 .m(3) 3228 .n(4) 3229 .k(8) 3230 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3231 } 3232 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,strided_cn)3233 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, strided_cn) { 3234 TEST_REQUIRES_ARM_NEON_FMA; 3235 GemmMicrokernelTester() 3236 .mr(3) 3237 .nr(4) 3238 .kr(8) 3239 .sr(1) 3240 .m(3) 3241 .n(4) 3242 .k(8) 3243 .cn_stride(7) 3244 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3245 } 3246 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_eq_8_strided_a)3247 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_eq_8_strided_a) { 3248 TEST_REQUIRES_ARM_NEON_FMA; 3249 GemmMicrokernelTester() 3250 .mr(3) 3251 .nr(4) 3252 .kr(8) 3253 .sr(1) 3254 .m(3) 3255 .n(4) 3256 .k(8) 3257 .a_stride(11) 3258 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3259 } 3260 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_eq_8_subtile)3261 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_eq_8_subtile) { 3262 TEST_REQUIRES_ARM_NEON_FMA; 3263 for (uint32_t n = 1; n <= 4; n++) { 3264 for (uint32_t m = 1; m <= 3; m++) { 3265 GemmMicrokernelTester() 3266 .mr(3) 3267 .nr(4) 3268 .kr(8) 3269 .sr(1) 3270 .m(m) 3271 .n(n) 3272 .k(8) 3273 .iterations(1) 3274 
.Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3275 } 3276 } 3277 } 3278 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_eq_8_subtile_m)3279 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_eq_8_subtile_m) { 3280 TEST_REQUIRES_ARM_NEON_FMA; 3281 for (uint32_t m = 1; m <= 3; m++) { 3282 GemmMicrokernelTester() 3283 .mr(3) 3284 .nr(4) 3285 .kr(8) 3286 .sr(1) 3287 .m(m) 3288 .n(4) 3289 .k(8) 3290 .iterations(1) 3291 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3292 } 3293 } 3294 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_eq_8_subtile_n)3295 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_eq_8_subtile_n) { 3296 TEST_REQUIRES_ARM_NEON_FMA; 3297 for (uint32_t n = 1; n <= 4; n++) { 3298 GemmMicrokernelTester() 3299 .mr(3) 3300 .nr(4) 3301 .kr(8) 3302 .sr(1) 3303 .m(3) 3304 .n(n) 3305 .k(8) 3306 .iterations(1) 3307 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3308 } 3309 } 3310 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_lt_8)3311 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_lt_8) { 3312 TEST_REQUIRES_ARM_NEON_FMA; 3313 for (size_t k = 1; k < 8; k++) { 3314 GemmMicrokernelTester() 3315 .mr(3) 3316 .nr(4) 3317 .kr(8) 3318 .sr(1) 3319 .m(3) 3320 .n(4) 3321 .k(k) 3322 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3323 } 3324 } 3325 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_lt_8_strided_a)3326 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_lt_8_strided_a) { 3327 TEST_REQUIRES_ARM_NEON_FMA; 3328 for (size_t k = 1; k < 8; k++) { 3329 GemmMicrokernelTester() 3330 .mr(3) 3331 .nr(4) 3332 .kr(8) 3333 .sr(1) 3334 .m(3) 3335 .n(4) 3336 .k(k) 3337 .a_stride(11) 3338 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3339 } 3340 } 3341 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_lt_8_subtile)3342 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_lt_8_subtile) { 3343 
TEST_REQUIRES_ARM_NEON_FMA; 3344 for (size_t k = 1; k < 8; k++) { 3345 for (uint32_t n = 1; n <= 4; n++) { 3346 for (uint32_t m = 1; m <= 3; m++) { 3347 GemmMicrokernelTester() 3348 .mr(3) 3349 .nr(4) 3350 .kr(8) 3351 .sr(1) 3352 .m(m) 3353 .n(n) 3354 .k(k) 3355 .iterations(1) 3356 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3357 } 3358 } 3359 } 3360 } 3361 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_gt_8)3362 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_gt_8) { 3363 TEST_REQUIRES_ARM_NEON_FMA; 3364 for (size_t k = 9; k < 16; k++) { 3365 GemmMicrokernelTester() 3366 .mr(3) 3367 .nr(4) 3368 .kr(8) 3369 .sr(1) 3370 .m(3) 3371 .n(4) 3372 .k(k) 3373 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3374 } 3375 } 3376 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_gt_8_strided_a)3377 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_gt_8_strided_a) { 3378 TEST_REQUIRES_ARM_NEON_FMA; 3379 for (size_t k = 9; k < 16; k++) { 3380 GemmMicrokernelTester() 3381 .mr(3) 3382 .nr(4) 3383 .kr(8) 3384 .sr(1) 3385 .m(3) 3386 .n(4) 3387 .k(k) 3388 .a_stride(19) 3389 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3390 } 3391 } 3392 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_gt_8_subtile)3393 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_gt_8_subtile) { 3394 TEST_REQUIRES_ARM_NEON_FMA; 3395 for (size_t k = 9; k < 16; k++) { 3396 for (uint32_t n = 1; n <= 4; n++) { 3397 for (uint32_t m = 1; m <= 3; m++) { 3398 GemmMicrokernelTester() 3399 .mr(3) 3400 .nr(4) 3401 .kr(8) 3402 .sr(1) 3403 .m(m) 3404 .n(n) 3405 .k(k) 3406 .iterations(1) 3407 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3408 } 3409 } 3410 } 3411 } 3412 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_div_8)3413 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_div_8) { 3414 TEST_REQUIRES_ARM_NEON_FMA; 3415 for (size_t k = 16; k <= 80; k += 8) { 3416 
GemmMicrokernelTester() 3417 .mr(3) 3418 .nr(4) 3419 .kr(8) 3420 .sr(1) 3421 .m(3) 3422 .n(4) 3423 .k(k) 3424 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3425 } 3426 } 3427 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_div_8_strided_a)3428 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_div_8_strided_a) { 3429 TEST_REQUIRES_ARM_NEON_FMA; 3430 for (size_t k = 16; k <= 80; k += 8) { 3431 GemmMicrokernelTester() 3432 .mr(3) 3433 .nr(4) 3434 .kr(8) 3435 .sr(1) 3436 .m(3) 3437 .n(4) 3438 .k(k) 3439 .a_stride(83) 3440 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3441 } 3442 } 3443 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,k_div_8_subtile)3444 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, k_div_8_subtile) { 3445 TEST_REQUIRES_ARM_NEON_FMA; 3446 for (size_t k = 16; k <= 80; k += 8) { 3447 for (uint32_t n = 1; n <= 4; n++) { 3448 for (uint32_t m = 1; m <= 3; m++) { 3449 GemmMicrokernelTester() 3450 .mr(3) 3451 .nr(4) 3452 .kr(8) 3453 .sr(1) 3454 .m(m) 3455 .n(n) 3456 .k(k) 3457 .iterations(1) 3458 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3459 } 3460 } 3461 } 3462 } 3463 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_gt_4)3464 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_gt_4) { 3465 TEST_REQUIRES_ARM_NEON_FMA; 3466 for (uint32_t n = 5; n < 8; n++) { 3467 for (size_t k = 1; k <= 40; k += 9) { 3468 GemmMicrokernelTester() 3469 .mr(3) 3470 .nr(4) 3471 .kr(8) 3472 .sr(1) 3473 .m(3) 3474 .n(n) 3475 .k(k) 3476 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3477 } 3478 } 3479 } 3480 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_gt_4_strided_cn)3481 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_gt_4_strided_cn) { 3482 TEST_REQUIRES_ARM_NEON_FMA; 3483 for (uint32_t n = 5; n < 8; n++) { 3484 for (size_t k = 1; k <= 40; k += 9) { 3485 GemmMicrokernelTester() 3486 .mr(3) 3487 .nr(4) 3488 .kr(8) 3489 .sr(1) 3490 .m(3) 
3491 .n(n) 3492 .k(k) 3493 .cn_stride(7) 3494 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3495 } 3496 } 3497 } 3498 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_gt_4_strided_a)3499 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_gt_4_strided_a) { 3500 TEST_REQUIRES_ARM_NEON_FMA; 3501 for (uint32_t n = 5; n < 8; n++) { 3502 for (size_t k = 1; k <= 40; k += 9) { 3503 GemmMicrokernelTester() 3504 .mr(3) 3505 .nr(4) 3506 .kr(8) 3507 .sr(1) 3508 .m(3) 3509 .n(n) 3510 .k(k) 3511 .a_stride(43) 3512 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3513 } 3514 } 3515 } 3516 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_gt_4_subtile)3517 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_gt_4_subtile) { 3518 TEST_REQUIRES_ARM_NEON_FMA; 3519 for (uint32_t n = 5; n < 8; n++) { 3520 for (size_t k = 1; k <= 40; k += 9) { 3521 for (uint32_t m = 1; m <= 3; m++) { 3522 GemmMicrokernelTester() 3523 .mr(3) 3524 .nr(4) 3525 .kr(8) 3526 .sr(1) 3527 .m(m) 3528 .n(n) 3529 .k(k) 3530 .iterations(1) 3531 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3532 } 3533 } 3534 } 3535 } 3536 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_div_4)3537 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_div_4) { 3538 TEST_REQUIRES_ARM_NEON_FMA; 3539 for (uint32_t n = 8; n <= 12; n += 4) { 3540 for (size_t k = 1; k <= 40; k += 9) { 3541 GemmMicrokernelTester() 3542 .mr(3) 3543 .nr(4) 3544 .kr(8) 3545 .sr(1) 3546 .m(3) 3547 .n(n) 3548 .k(k) 3549 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3550 } 3551 } 3552 } 3553 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_div_4_strided_cn)3554 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_div_4_strided_cn) { 3555 TEST_REQUIRES_ARM_NEON_FMA; 3556 for (uint32_t n = 8; n <= 12; n += 4) { 3557 for (size_t k = 1; k <= 40; k += 9) { 3558 GemmMicrokernelTester() 3559 .mr(3) 3560 .nr(4) 3561 .kr(8) 3562 .sr(1) 3563 .m(3) 3564 
.n(n) 3565 .k(k) 3566 .cn_stride(7) 3567 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3568 } 3569 } 3570 } 3571 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_div_4_strided_a)3572 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_div_4_strided_a) { 3573 TEST_REQUIRES_ARM_NEON_FMA; 3574 for (uint32_t n = 8; n <= 12; n += 4) { 3575 for (size_t k = 1; k <= 40; k += 9) { 3576 GemmMicrokernelTester() 3577 .mr(3) 3578 .nr(4) 3579 .kr(8) 3580 .sr(1) 3581 .m(3) 3582 .n(n) 3583 .k(k) 3584 .a_stride(43) 3585 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3586 } 3587 } 3588 } 3589 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,n_div_4_subtile)3590 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, n_div_4_subtile) { 3591 TEST_REQUIRES_ARM_NEON_FMA; 3592 for (uint32_t n = 8; n <= 12; n += 4) { 3593 for (size_t k = 1; k <= 40; k += 9) { 3594 for (uint32_t m = 1; m <= 3; m++) { 3595 GemmMicrokernelTester() 3596 .mr(3) 3597 .nr(4) 3598 .kr(8) 3599 .sr(1) 3600 .m(m) 3601 .n(n) 3602 .k(k) 3603 .iterations(1) 3604 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3605 } 3606 } 3607 } 3608 } 3609 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,strided_cm_subtile)3610 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, strided_cm_subtile) { 3611 TEST_REQUIRES_ARM_NEON_FMA; 3612 for (size_t k = 1; k <= 40; k += 9) { 3613 for (uint32_t n = 1; n <= 4; n++) { 3614 for (uint32_t m = 1; m <= 3; m++) { 3615 GemmMicrokernelTester() 3616 .mr(3) 3617 .nr(4) 3618 .kr(8) 3619 .sr(1) 3620 .m(m) 3621 .n(n) 3622 .k(k) 3623 .cm_stride(7) 3624 .iterations(1) 3625 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3626 } 3627 } 3628 } 3629 } 3630 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,qmin)3631 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, qmin) { 3632 TEST_REQUIRES_ARM_NEON_FMA; 3633 GemmMicrokernelTester() 3634 .mr(3) 3635 .nr(4) 3636 .kr(8) 3637 .sr(1) 3638 .m(3) 3639 
.n(4) 3640 .k(8) 3641 .qmin(128) 3642 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3643 } 3644 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,qmax)3645 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, qmax) { 3646 TEST_REQUIRES_ARM_NEON_FMA; 3647 GemmMicrokernelTester() 3648 .mr(3) 3649 .nr(4) 3650 .kr(8) 3651 .sr(1) 3652 .m(3) 3653 .n(4) 3654 .k(8) 3655 .qmax(128) 3656 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3657 } 3658 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP,strided_cm)3659 TEST(BF16_GEMM_MINMAX_3X4C8__NEONFMA_ZIP, strided_cm) { 3660 TEST_REQUIRES_ARM_NEON_FMA; 3661 GemmMicrokernelTester() 3662 .mr(3) 3663 .nr(4) 3664 .kr(8) 3665 .sr(1) 3666 .m(3) 3667 .n(4) 3668 .k(8) 3669 .cm_stride(7) 3670 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3671 } 3672 #endif // XNN_ARCH_ARM64 3673 3674 3675 #if XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_eq_8)3676 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_eq_8) { 3677 TEST_REQUIRES_ARM_NEON_FMA; 3678 GemmMicrokernelTester() 3679 .mr(4) 3680 .nr(4) 3681 .kr(8) 3682 .sr(1) 3683 .m(4) 3684 .n(4) 3685 .k(8) 3686 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3687 } 3688 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,strided_cn)3689 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, strided_cn) { 3690 TEST_REQUIRES_ARM_NEON_FMA; 3691 GemmMicrokernelTester() 3692 .mr(4) 3693 .nr(4) 3694 .kr(8) 3695 .sr(1) 3696 .m(4) 3697 .n(4) 3698 .k(8) 3699 .cn_stride(7) 3700 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3701 } 3702 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_eq_8_strided_a)3703 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_eq_8_strided_a) { 3704 TEST_REQUIRES_ARM_NEON_FMA; 3705 GemmMicrokernelTester() 3706 .mr(4) 3707 .nr(4) 3708 .kr(8) 3709 .sr(1) 3710 .m(4) 3711 .n(4) 3712 .k(8) 3713 .a_stride(11) 3714 
.Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3715 } 3716 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_eq_8_subtile)3717 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_eq_8_subtile) { 3718 TEST_REQUIRES_ARM_NEON_FMA; 3719 for (uint32_t n = 1; n <= 4; n++) { 3720 for (uint32_t m = 1; m <= 4; m++) { 3721 GemmMicrokernelTester() 3722 .mr(4) 3723 .nr(4) 3724 .kr(8) 3725 .sr(1) 3726 .m(m) 3727 .n(n) 3728 .k(8) 3729 .iterations(1) 3730 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3731 } 3732 } 3733 } 3734 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_eq_8_subtile_m)3735 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_eq_8_subtile_m) { 3736 TEST_REQUIRES_ARM_NEON_FMA; 3737 for (uint32_t m = 1; m <= 4; m++) { 3738 GemmMicrokernelTester() 3739 .mr(4) 3740 .nr(4) 3741 .kr(8) 3742 .sr(1) 3743 .m(m) 3744 .n(4) 3745 .k(8) 3746 .iterations(1) 3747 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3748 } 3749 } 3750 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_eq_8_subtile_n)3751 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_eq_8_subtile_n) { 3752 TEST_REQUIRES_ARM_NEON_FMA; 3753 for (uint32_t n = 1; n <= 4; n++) { 3754 GemmMicrokernelTester() 3755 .mr(4) 3756 .nr(4) 3757 .kr(8) 3758 .sr(1) 3759 .m(4) 3760 .n(n) 3761 .k(8) 3762 .iterations(1) 3763 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3764 } 3765 } 3766 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_lt_8)3767 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_lt_8) { 3768 TEST_REQUIRES_ARM_NEON_FMA; 3769 for (size_t k = 1; k < 8; k++) { 3770 GemmMicrokernelTester() 3771 .mr(4) 3772 .nr(4) 3773 .kr(8) 3774 .sr(1) 3775 .m(4) 3776 .n(4) 3777 .k(k) 3778 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3779 } 3780 } 3781 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_lt_8_strided_a)3782 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, 
k_lt_8_strided_a) { 3783 TEST_REQUIRES_ARM_NEON_FMA; 3784 for (size_t k = 1; k < 8; k++) { 3785 GemmMicrokernelTester() 3786 .mr(4) 3787 .nr(4) 3788 .kr(8) 3789 .sr(1) 3790 .m(4) 3791 .n(4) 3792 .k(k) 3793 .a_stride(11) 3794 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3795 } 3796 } 3797 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_lt_8_subtile)3798 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_lt_8_subtile) { 3799 TEST_REQUIRES_ARM_NEON_FMA; 3800 for (size_t k = 1; k < 8; k++) { 3801 for (uint32_t n = 1; n <= 4; n++) { 3802 for (uint32_t m = 1; m <= 4; m++) { 3803 GemmMicrokernelTester() 3804 .mr(4) 3805 .nr(4) 3806 .kr(8) 3807 .sr(1) 3808 .m(m) 3809 .n(n) 3810 .k(k) 3811 .iterations(1) 3812 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3813 } 3814 } 3815 } 3816 } 3817 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_gt_8)3818 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_gt_8) { 3819 TEST_REQUIRES_ARM_NEON_FMA; 3820 for (size_t k = 9; k < 16; k++) { 3821 GemmMicrokernelTester() 3822 .mr(4) 3823 .nr(4) 3824 .kr(8) 3825 .sr(1) 3826 .m(4) 3827 .n(4) 3828 .k(k) 3829 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3830 } 3831 } 3832 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_gt_8_strided_a)3833 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_gt_8_strided_a) { 3834 TEST_REQUIRES_ARM_NEON_FMA; 3835 for (size_t k = 9; k < 16; k++) { 3836 GemmMicrokernelTester() 3837 .mr(4) 3838 .nr(4) 3839 .kr(8) 3840 .sr(1) 3841 .m(4) 3842 .n(4) 3843 .k(k) 3844 .a_stride(19) 3845 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3846 } 3847 } 3848 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_gt_8_subtile)3849 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_gt_8_subtile) { 3850 TEST_REQUIRES_ARM_NEON_FMA; 3851 for (size_t k = 9; k < 16; k++) { 3852 for (uint32_t n = 1; n <= 4; n++) { 3853 for (uint32_t m = 1; m <= 4; m++) { 3854 
GemmMicrokernelTester() 3855 .mr(4) 3856 .nr(4) 3857 .kr(8) 3858 .sr(1) 3859 .m(m) 3860 .n(n) 3861 .k(k) 3862 .iterations(1) 3863 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3864 } 3865 } 3866 } 3867 } 3868 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_div_8)3869 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_div_8) { 3870 TEST_REQUIRES_ARM_NEON_FMA; 3871 for (size_t k = 16; k <= 80; k += 8) { 3872 GemmMicrokernelTester() 3873 .mr(4) 3874 .nr(4) 3875 .kr(8) 3876 .sr(1) 3877 .m(4) 3878 .n(4) 3879 .k(k) 3880 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3881 } 3882 } 3883 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_div_8_strided_a)3884 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_div_8_strided_a) { 3885 TEST_REQUIRES_ARM_NEON_FMA; 3886 for (size_t k = 16; k <= 80; k += 8) { 3887 GemmMicrokernelTester() 3888 .mr(4) 3889 .nr(4) 3890 .kr(8) 3891 .sr(1) 3892 .m(4) 3893 .n(4) 3894 .k(k) 3895 .a_stride(83) 3896 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3897 } 3898 } 3899 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,k_div_8_subtile)3900 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, k_div_8_subtile) { 3901 TEST_REQUIRES_ARM_NEON_FMA; 3902 for (size_t k = 16; k <= 80; k += 8) { 3903 for (uint32_t n = 1; n <= 4; n++) { 3904 for (uint32_t m = 1; m <= 4; m++) { 3905 GemmMicrokernelTester() 3906 .mr(4) 3907 .nr(4) 3908 .kr(8) 3909 .sr(1) 3910 .m(m) 3911 .n(n) 3912 .k(k) 3913 .iterations(1) 3914 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3915 } 3916 } 3917 } 3918 } 3919 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_gt_4)3920 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_gt_4) { 3921 TEST_REQUIRES_ARM_NEON_FMA; 3922 for (uint32_t n = 5; n < 8; n++) { 3923 for (size_t k = 1; k <= 40; k += 9) { 3924 GemmMicrokernelTester() 3925 .mr(4) 3926 .nr(4) 3927 .kr(8) 3928 .sr(1) 3929 .m(4) 3930 .n(n) 3931 .k(k) 3932 
.Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3933 } 3934 } 3935 } 3936 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_gt_4_strided_cn)3937 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_gt_4_strided_cn) { 3938 TEST_REQUIRES_ARM_NEON_FMA; 3939 for (uint32_t n = 5; n < 8; n++) { 3940 for (size_t k = 1; k <= 40; k += 9) { 3941 GemmMicrokernelTester() 3942 .mr(4) 3943 .nr(4) 3944 .kr(8) 3945 .sr(1) 3946 .m(4) 3947 .n(n) 3948 .k(k) 3949 .cn_stride(7) 3950 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3951 } 3952 } 3953 } 3954 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_gt_4_strided_a)3955 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_gt_4_strided_a) { 3956 TEST_REQUIRES_ARM_NEON_FMA; 3957 for (uint32_t n = 5; n < 8; n++) { 3958 for (size_t k = 1; k <= 40; k += 9) { 3959 GemmMicrokernelTester() 3960 .mr(4) 3961 .nr(4) 3962 .kr(8) 3963 .sr(1) 3964 .m(4) 3965 .n(n) 3966 .k(k) 3967 .a_stride(43) 3968 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3969 } 3970 } 3971 } 3972 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_gt_4_subtile)3973 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_gt_4_subtile) { 3974 TEST_REQUIRES_ARM_NEON_FMA; 3975 for (uint32_t n = 5; n < 8; n++) { 3976 for (size_t k = 1; k <= 40; k += 9) { 3977 for (uint32_t m = 1; m <= 4; m++) { 3978 GemmMicrokernelTester() 3979 .mr(4) 3980 .nr(4) 3981 .kr(8) 3982 .sr(1) 3983 .m(m) 3984 .n(n) 3985 .k(k) 3986 .iterations(1) 3987 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 3988 } 3989 } 3990 } 3991 } 3992 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_div_4)3993 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_div_4) { 3994 TEST_REQUIRES_ARM_NEON_FMA; 3995 for (uint32_t n = 8; n <= 12; n += 4) { 3996 for (size_t k = 1; k <= 40; k += 9) { 3997 GemmMicrokernelTester() 3998 .mr(4) 3999 .nr(4) 4000 .kr(8) 4001 .sr(1) 4002 .m(4) 4003 .n(n) 4004 .k(k) 4005 
.Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4006 } 4007 } 4008 } 4009 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_div_4_strided_cn)4010 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_div_4_strided_cn) { 4011 TEST_REQUIRES_ARM_NEON_FMA; 4012 for (uint32_t n = 8; n <= 12; n += 4) { 4013 for (size_t k = 1; k <= 40; k += 9) { 4014 GemmMicrokernelTester() 4015 .mr(4) 4016 .nr(4) 4017 .kr(8) 4018 .sr(1) 4019 .m(4) 4020 .n(n) 4021 .k(k) 4022 .cn_stride(7) 4023 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4024 } 4025 } 4026 } 4027 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_div_4_strided_a)4028 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_div_4_strided_a) { 4029 TEST_REQUIRES_ARM_NEON_FMA; 4030 for (uint32_t n = 8; n <= 12; n += 4) { 4031 for (size_t k = 1; k <= 40; k += 9) { 4032 GemmMicrokernelTester() 4033 .mr(4) 4034 .nr(4) 4035 .kr(8) 4036 .sr(1) 4037 .m(4) 4038 .n(n) 4039 .k(k) 4040 .a_stride(43) 4041 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4042 } 4043 } 4044 } 4045 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,n_div_4_subtile)4046 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, n_div_4_subtile) { 4047 TEST_REQUIRES_ARM_NEON_FMA; 4048 for (uint32_t n = 8; n <= 12; n += 4) { 4049 for (size_t k = 1; k <= 40; k += 9) { 4050 for (uint32_t m = 1; m <= 4; m++) { 4051 GemmMicrokernelTester() 4052 .mr(4) 4053 .nr(4) 4054 .kr(8) 4055 .sr(1) 4056 .m(m) 4057 .n(n) 4058 .k(k) 4059 .iterations(1) 4060 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4061 } 4062 } 4063 } 4064 } 4065 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,strided_cm_subtile)4066 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, strided_cm_subtile) { 4067 TEST_REQUIRES_ARM_NEON_FMA; 4068 for (size_t k = 1; k <= 40; k += 9) { 4069 for (uint32_t n = 1; n <= 4; n++) { 4070 for (uint32_t m = 1; m <= 4; m++) { 4071 GemmMicrokernelTester() 4072 .mr(4) 4073 
.nr(4) 4074 .kr(8) 4075 .sr(1) 4076 .m(m) 4077 .n(n) 4078 .k(k) 4079 .cm_stride(7) 4080 .iterations(1) 4081 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4082 } 4083 } 4084 } 4085 } 4086 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,qmin)4087 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, qmin) { 4088 TEST_REQUIRES_ARM_NEON_FMA; 4089 GemmMicrokernelTester() 4090 .mr(4) 4091 .nr(4) 4092 .kr(8) 4093 .sr(1) 4094 .m(4) 4095 .n(4) 4096 .k(8) 4097 .qmin(128) 4098 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4099 } 4100 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,qmax)4101 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, qmax) { 4102 TEST_REQUIRES_ARM_NEON_FMA; 4103 GemmMicrokernelTester() 4104 .mr(4) 4105 .nr(4) 4106 .kr(8) 4107 .sr(1) 4108 .m(4) 4109 .n(4) 4110 .k(8) 4111 .qmax(128) 4112 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4113 } 4114 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP,strided_cm)4115 TEST(BF16_GEMM_MINMAX_4X4C8__NEONFMA_ZIP, strided_cm) { 4116 TEST_REQUIRES_ARM_NEON_FMA; 4117 GemmMicrokernelTester() 4118 .mr(4) 4119 .nr(4) 4120 .kr(8) 4121 .sr(1) 4122 .m(4) 4123 .n(4) 4124 .k(8) 4125 .cm_stride(7) 4126 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4127 } 4128 #endif // XNN_ARCH_ARM64 4129 4130 4131 #if XNN_ARCH_ARM64 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_eq_8)4132 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_eq_8) { 4133 TEST_REQUIRES_ARM_NEON_FMA; 4134 GemmMicrokernelTester() 4135 .mr(5) 4136 .nr(4) 4137 .kr(8) 4138 .sr(1) 4139 .m(5) 4140 .n(4) 4141 .k(8) 4142 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4143 } 4144 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,strided_cn)4145 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, strided_cn) { 4146 TEST_REQUIRES_ARM_NEON_FMA; 4147 GemmMicrokernelTester() 4148 .mr(5) 4149 .nr(4) 4150 .kr(8) 4151 .sr(1) 4152 
.m(5) 4153 .n(4) 4154 .k(8) 4155 .cn_stride(7) 4156 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4157 } 4158 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_eq_8_strided_a)4159 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_eq_8_strided_a) { 4160 TEST_REQUIRES_ARM_NEON_FMA; 4161 GemmMicrokernelTester() 4162 .mr(5) 4163 .nr(4) 4164 .kr(8) 4165 .sr(1) 4166 .m(5) 4167 .n(4) 4168 .k(8) 4169 .a_stride(11) 4170 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4171 } 4172 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_eq_8_subtile)4173 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_eq_8_subtile) { 4174 TEST_REQUIRES_ARM_NEON_FMA; 4175 for (uint32_t n = 1; n <= 4; n++) { 4176 for (uint32_t m = 1; m <= 5; m++) { 4177 GemmMicrokernelTester() 4178 .mr(5) 4179 .nr(4) 4180 .kr(8) 4181 .sr(1) 4182 .m(m) 4183 .n(n) 4184 .k(8) 4185 .iterations(1) 4186 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4187 } 4188 } 4189 } 4190 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_eq_8_subtile_m)4191 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_eq_8_subtile_m) { 4192 TEST_REQUIRES_ARM_NEON_FMA; 4193 for (uint32_t m = 1; m <= 5; m++) { 4194 GemmMicrokernelTester() 4195 .mr(5) 4196 .nr(4) 4197 .kr(8) 4198 .sr(1) 4199 .m(m) 4200 .n(4) 4201 .k(8) 4202 .iterations(1) 4203 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4204 } 4205 } 4206 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_eq_8_subtile_n)4207 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_eq_8_subtile_n) { 4208 TEST_REQUIRES_ARM_NEON_FMA; 4209 for (uint32_t n = 1; n <= 4; n++) { 4210 GemmMicrokernelTester() 4211 .mr(5) 4212 .nr(4) 4213 .kr(8) 4214 .sr(1) 4215 .m(5) 4216 .n(n) 4217 .k(8) 4218 .iterations(1) 4219 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4220 } 4221 } 4222 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_lt_8)4223 
TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_lt_8) { 4224 TEST_REQUIRES_ARM_NEON_FMA; 4225 for (size_t k = 1; k < 8; k++) { 4226 GemmMicrokernelTester() 4227 .mr(5) 4228 .nr(4) 4229 .kr(8) 4230 .sr(1) 4231 .m(5) 4232 .n(4) 4233 .k(k) 4234 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4235 } 4236 } 4237 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_lt_8_strided_a)4238 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_lt_8_strided_a) { 4239 TEST_REQUIRES_ARM_NEON_FMA; 4240 for (size_t k = 1; k < 8; k++) { 4241 GemmMicrokernelTester() 4242 .mr(5) 4243 .nr(4) 4244 .kr(8) 4245 .sr(1) 4246 .m(5) 4247 .n(4) 4248 .k(k) 4249 .a_stride(11) 4250 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4251 } 4252 } 4253 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_lt_8_subtile)4254 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_lt_8_subtile) { 4255 TEST_REQUIRES_ARM_NEON_FMA; 4256 for (size_t k = 1; k < 8; k++) { 4257 for (uint32_t n = 1; n <= 4; n++) { 4258 for (uint32_t m = 1; m <= 5; m++) { 4259 GemmMicrokernelTester() 4260 .mr(5) 4261 .nr(4) 4262 .kr(8) 4263 .sr(1) 4264 .m(m) 4265 .n(n) 4266 .k(k) 4267 .iterations(1) 4268 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4269 } 4270 } 4271 } 4272 } 4273 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_gt_8)4274 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_gt_8) { 4275 TEST_REQUIRES_ARM_NEON_FMA; 4276 for (size_t k = 9; k < 16; k++) { 4277 GemmMicrokernelTester() 4278 .mr(5) 4279 .nr(4) 4280 .kr(8) 4281 .sr(1) 4282 .m(5) 4283 .n(4) 4284 .k(k) 4285 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4286 } 4287 } 4288 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_gt_8_strided_a)4289 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_gt_8_strided_a) { 4290 TEST_REQUIRES_ARM_NEON_FMA; 4291 for (size_t k = 9; k < 16; k++) { 4292 GemmMicrokernelTester() 4293 .mr(5) 4294 .nr(4) 4295 .kr(8) 4296 
.sr(1) 4297 .m(5) 4298 .n(4) 4299 .k(k) 4300 .a_stride(19) 4301 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4302 } 4303 } 4304 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_gt_8_subtile)4305 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_gt_8_subtile) { 4306 TEST_REQUIRES_ARM_NEON_FMA; 4307 for (size_t k = 9; k < 16; k++) { 4308 for (uint32_t n = 1; n <= 4; n++) { 4309 for (uint32_t m = 1; m <= 5; m++) { 4310 GemmMicrokernelTester() 4311 .mr(5) 4312 .nr(4) 4313 .kr(8) 4314 .sr(1) 4315 .m(m) 4316 .n(n) 4317 .k(k) 4318 .iterations(1) 4319 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4320 } 4321 } 4322 } 4323 } 4324 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_div_8)4325 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_div_8) { 4326 TEST_REQUIRES_ARM_NEON_FMA; 4327 for (size_t k = 16; k <= 80; k += 8) { 4328 GemmMicrokernelTester() 4329 .mr(5) 4330 .nr(4) 4331 .kr(8) 4332 .sr(1) 4333 .m(5) 4334 .n(4) 4335 .k(k) 4336 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4337 } 4338 } 4339 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_div_8_strided_a)4340 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_div_8_strided_a) { 4341 TEST_REQUIRES_ARM_NEON_FMA; 4342 for (size_t k = 16; k <= 80; k += 8) { 4343 GemmMicrokernelTester() 4344 .mr(5) 4345 .nr(4) 4346 .kr(8) 4347 .sr(1) 4348 .m(5) 4349 .n(4) 4350 .k(k) 4351 .a_stride(83) 4352 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4353 } 4354 } 4355 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,k_div_8_subtile)4356 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, k_div_8_subtile) { 4357 TEST_REQUIRES_ARM_NEON_FMA; 4358 for (size_t k = 16; k <= 80; k += 8) { 4359 for (uint32_t n = 1; n <= 4; n++) { 4360 for (uint32_t m = 1; m <= 5; m++) { 4361 GemmMicrokernelTester() 4362 .mr(5) 4363 .nr(4) 4364 .kr(8) 4365 .sr(1) 4366 .m(m) 4367 .n(n) 4368 .k(k) 4369 .iterations(1) 4370 
.Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4371 } 4372 } 4373 } 4374 } 4375 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_gt_4)4376 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_gt_4) { 4377 TEST_REQUIRES_ARM_NEON_FMA; 4378 for (uint32_t n = 5; n < 8; n++) { 4379 for (size_t k = 1; k <= 40; k += 9) { 4380 GemmMicrokernelTester() 4381 .mr(5) 4382 .nr(4) 4383 .kr(8) 4384 .sr(1) 4385 .m(5) 4386 .n(n) 4387 .k(k) 4388 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4389 } 4390 } 4391 } 4392 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_gt_4_strided_cn)4393 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_gt_4_strided_cn) { 4394 TEST_REQUIRES_ARM_NEON_FMA; 4395 for (uint32_t n = 5; n < 8; n++) { 4396 for (size_t k = 1; k <= 40; k += 9) { 4397 GemmMicrokernelTester() 4398 .mr(5) 4399 .nr(4) 4400 .kr(8) 4401 .sr(1) 4402 .m(5) 4403 .n(n) 4404 .k(k) 4405 .cn_stride(7) 4406 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4407 } 4408 } 4409 } 4410 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_gt_4_strided_a)4411 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_gt_4_strided_a) { 4412 TEST_REQUIRES_ARM_NEON_FMA; 4413 for (uint32_t n = 5; n < 8; n++) { 4414 for (size_t k = 1; k <= 40; k += 9) { 4415 GemmMicrokernelTester() 4416 .mr(5) 4417 .nr(4) 4418 .kr(8) 4419 .sr(1) 4420 .m(5) 4421 .n(n) 4422 .k(k) 4423 .a_stride(43) 4424 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4425 } 4426 } 4427 } 4428 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_gt_4_subtile)4429 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_gt_4_subtile) { 4430 TEST_REQUIRES_ARM_NEON_FMA; 4431 for (uint32_t n = 5; n < 8; n++) { 4432 for (size_t k = 1; k <= 40; k += 9) { 4433 for (uint32_t m = 1; m <= 5; m++) { 4434 GemmMicrokernelTester() 4435 .mr(5) 4436 .nr(4) 4437 .kr(8) 4438 .sr(1) 4439 .m(m) 4440 .n(n) 4441 .k(k) 4442 .iterations(1) 4443 
.Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4444 } 4445 } 4446 } 4447 } 4448 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_div_4)4449 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_div_4) { 4450 TEST_REQUIRES_ARM_NEON_FMA; 4451 for (uint32_t n = 8; n <= 12; n += 4) { 4452 for (size_t k = 1; k <= 40; k += 9) { 4453 GemmMicrokernelTester() 4454 .mr(5) 4455 .nr(4) 4456 .kr(8) 4457 .sr(1) 4458 .m(5) 4459 .n(n) 4460 .k(k) 4461 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4462 } 4463 } 4464 } 4465 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_div_4_strided_cn)4466 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_div_4_strided_cn) { 4467 TEST_REQUIRES_ARM_NEON_FMA; 4468 for (uint32_t n = 8; n <= 12; n += 4) { 4469 for (size_t k = 1; k <= 40; k += 9) { 4470 GemmMicrokernelTester() 4471 .mr(5) 4472 .nr(4) 4473 .kr(8) 4474 .sr(1) 4475 .m(5) 4476 .n(n) 4477 .k(k) 4478 .cn_stride(7) 4479 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4480 } 4481 } 4482 } 4483 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_div_4_strided_a)4484 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_div_4_strided_a) { 4485 TEST_REQUIRES_ARM_NEON_FMA; 4486 for (uint32_t n = 8; n <= 12; n += 4) { 4487 for (size_t k = 1; k <= 40; k += 9) { 4488 GemmMicrokernelTester() 4489 .mr(5) 4490 .nr(4) 4491 .kr(8) 4492 .sr(1) 4493 .m(5) 4494 .n(n) 4495 .k(k) 4496 .a_stride(43) 4497 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4498 } 4499 } 4500 } 4501 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,n_div_4_subtile)4502 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, n_div_4_subtile) { 4503 TEST_REQUIRES_ARM_NEON_FMA; 4504 for (uint32_t n = 8; n <= 12; n += 4) { 4505 for (size_t k = 1; k <= 40; k += 9) { 4506 for (uint32_t m = 1; m <= 5; m++) { 4507 GemmMicrokernelTester() 4508 .mr(5) 4509 .nr(4) 4510 .kr(8) 4511 .sr(1) 4512 .m(m) 4513 .n(n) 4514 .k(k) 4515 
.iterations(1) 4516 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4517 } 4518 } 4519 } 4520 } 4521 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,strided_cm_subtile)4522 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, strided_cm_subtile) { 4523 TEST_REQUIRES_ARM_NEON_FMA; 4524 for (size_t k = 1; k <= 40; k += 9) { 4525 for (uint32_t n = 1; n <= 4; n++) { 4526 for (uint32_t m = 1; m <= 5; m++) { 4527 GemmMicrokernelTester() 4528 .mr(5) 4529 .nr(4) 4530 .kr(8) 4531 .sr(1) 4532 .m(m) 4533 .n(n) 4534 .k(k) 4535 .cm_stride(7) 4536 .iterations(1) 4537 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4538 } 4539 } 4540 } 4541 } 4542 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,qmin)4543 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, qmin) { 4544 TEST_REQUIRES_ARM_NEON_FMA; 4545 GemmMicrokernelTester() 4546 .mr(5) 4547 .nr(4) 4548 .kr(8) 4549 .sr(1) 4550 .m(5) 4551 .n(4) 4552 .k(8) 4553 .qmin(128) 4554 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4555 } 4556 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,qmax)4557 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, qmax) { 4558 TEST_REQUIRES_ARM_NEON_FMA; 4559 GemmMicrokernelTester() 4560 .mr(5) 4561 .nr(4) 4562 .kr(8) 4563 .sr(1) 4564 .m(5) 4565 .n(4) 4566 .k(8) 4567 .qmax(128) 4568 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4569 } 4570 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP,strided_cm)4571 TEST(BF16_GEMM_MINMAX_5X4C8__NEONFMA_ZIP, strided_cm) { 4572 TEST_REQUIRES_ARM_NEON_FMA; 4573 GemmMicrokernelTester() 4574 .mr(5) 4575 .nr(4) 4576 .kr(8) 4577 .sr(1) 4578 .m(5) 4579 .n(4) 4580 .k(8) 4581 .cm_stride(7) 4582 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonfma_zip, xnn_init_bf16_minmax_scalar_params); 4583 } 4584 #endif // XNN_ARCH_ARM64 4585 4586 4587 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8)4588 
TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8) { 4589 TEST_REQUIRES_ARM_NEON_BF16; 4590 GemmMicrokernelTester() 4591 .mr(1) 4592 .nr(8) 4593 .kr(2) 4594 .sr(1) 4595 .m(1) 4596 .n(8) 4597 .k(8) 4598 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4599 } 4600 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cn)4601 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cn) { 4602 TEST_REQUIRES_ARM_NEON_BF16; 4603 GemmMicrokernelTester() 4604 .mr(1) 4605 .nr(8) 4606 .kr(2) 4607 .sr(1) 4608 .m(1) 4609 .n(8) 4610 .k(8) 4611 .cn_stride(11) 4612 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4613 } 4614 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_strided_a)4615 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_strided_a) { 4616 TEST_REQUIRES_ARM_NEON_BF16; 4617 GemmMicrokernelTester() 4618 .mr(1) 4619 .nr(8) 4620 .kr(2) 4621 .sr(1) 4622 .m(1) 4623 .n(8) 4624 .k(8) 4625 .a_stride(11) 4626 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4627 } 4628 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile)4629 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile) { 4630 TEST_REQUIRES_ARM_NEON_BF16; 4631 for (uint32_t n = 1; n <= 8; n++) { 4632 for (uint32_t m = 1; m <= 1; m++) { 4633 GemmMicrokernelTester() 4634 .mr(1) 4635 .nr(8) 4636 .kr(2) 4637 .sr(1) 4638 .m(m) 4639 .n(n) 4640 .k(8) 4641 .iterations(1) 4642 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4643 } 4644 } 4645 } 4646 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile_m)4647 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_m) { 4648 TEST_REQUIRES_ARM_NEON_BF16; 4649 for (uint32_t m = 1; m <= 1; m++) { 4650 GemmMicrokernelTester() 4651 
.mr(1) 4652 .nr(8) 4653 .kr(2) 4654 .sr(1) 4655 .m(m) 4656 .n(8) 4657 .k(8) 4658 .iterations(1) 4659 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4660 } 4661 } 4662 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile_n)4663 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_n) { 4664 TEST_REQUIRES_ARM_NEON_BF16; 4665 for (uint32_t n = 1; n <= 8; n++) { 4666 GemmMicrokernelTester() 4667 .mr(1) 4668 .nr(8) 4669 .kr(2) 4670 .sr(1) 4671 .m(1) 4672 .n(n) 4673 .k(8) 4674 .iterations(1) 4675 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4676 } 4677 } 4678 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8)4679 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8) { 4680 TEST_REQUIRES_ARM_NEON_BF16; 4681 for (size_t k = 1; k < 8; k++) { 4682 GemmMicrokernelTester() 4683 .mr(1) 4684 .nr(8) 4685 .kr(2) 4686 .sr(1) 4687 .m(1) 4688 .n(8) 4689 .k(k) 4690 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4691 } 4692 } 4693 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8_strided_a)4694 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_strided_a) { 4695 TEST_REQUIRES_ARM_NEON_BF16; 4696 for (size_t k = 1; k < 8; k++) { 4697 GemmMicrokernelTester() 4698 .mr(1) 4699 .nr(8) 4700 .kr(2) 4701 .sr(1) 4702 .m(1) 4703 .n(8) 4704 .k(k) 4705 .a_stride(11) 4706 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4707 } 4708 } 4709 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8_subtile)4710 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_subtile) { 4711 TEST_REQUIRES_ARM_NEON_BF16; 4712 for (size_t k = 1; k < 8; k++) { 4713 for (uint32_t n = 1; n <= 8; n++) { 4714 for (uint32_t m = 1; m <= 1; m++) { 4715 GemmMicrokernelTester() 4716 .mr(1) 4717 .nr(8) 
4718 .kr(2) 4719 .sr(1) 4720 .m(m) 4721 .n(n) 4722 .k(k) 4723 .iterations(1) 4724 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4725 } 4726 } 4727 } 4728 } 4729 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_gt_8)4730 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8) { 4731 TEST_REQUIRES_ARM_NEON_BF16; 4732 for (size_t k = 9; k < 16; k++) { 4733 GemmMicrokernelTester() 4734 .mr(1) 4735 .nr(8) 4736 .kr(2) 4737 .sr(1) 4738 .m(1) 4739 .n(8) 4740 .k(k) 4741 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4742 } 4743 } 4744 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_gt_8_strided_a)4745 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_strided_a) { 4746 TEST_REQUIRES_ARM_NEON_BF16; 4747 for (size_t k = 9; k < 16; k++) { 4748 GemmMicrokernelTester() 4749 .mr(1) 4750 .nr(8) 4751 .kr(2) 4752 .sr(1) 4753 .m(1) 4754 .n(8) 4755 .k(k) 4756 .a_stride(19) 4757 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4758 } 4759 } 4760 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_gt_8_subtile)4761 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_subtile) { 4762 TEST_REQUIRES_ARM_NEON_BF16; 4763 for (size_t k = 9; k < 16; k++) { 4764 for (uint32_t n = 1; n <= 8; n++) { 4765 for (uint32_t m = 1; m <= 1; m++) { 4766 GemmMicrokernelTester() 4767 .mr(1) 4768 .nr(8) 4769 .kr(2) 4770 .sr(1) 4771 .m(m) 4772 .n(n) 4773 .k(k) 4774 .iterations(1) 4775 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4776 } 4777 } 4778 } 4779 } 4780 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8)4781 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8) { 4782 TEST_REQUIRES_ARM_NEON_BF16; 4783 for (size_t k = 16; k <= 80; k += 8) { 4784 GemmMicrokernelTester() 4785 .mr(1) 4786 .nr(8) 
4787 .kr(2) 4788 .sr(1) 4789 .m(1) 4790 .n(8) 4791 .k(k) 4792 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4793 } 4794 } 4795 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8_strided_a)4796 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_strided_a) { 4797 TEST_REQUIRES_ARM_NEON_BF16; 4798 for (size_t k = 16; k <= 80; k += 8) { 4799 GemmMicrokernelTester() 4800 .mr(1) 4801 .nr(8) 4802 .kr(2) 4803 .sr(1) 4804 .m(1) 4805 .n(8) 4806 .k(k) 4807 .a_stride(83) 4808 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4809 } 4810 } 4811 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8_subtile)4812 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_subtile) { 4813 TEST_REQUIRES_ARM_NEON_BF16; 4814 for (size_t k = 16; k <= 80; k += 8) { 4815 for (uint32_t n = 1; n <= 8; n++) { 4816 for (uint32_t m = 1; m <= 1; m++) { 4817 GemmMicrokernelTester() 4818 .mr(1) 4819 .nr(8) 4820 .kr(2) 4821 .sr(1) 4822 .m(m) 4823 .n(n) 4824 .k(k) 4825 .iterations(1) 4826 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4827 } 4828 } 4829 } 4830 } 4831 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8)4832 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8) { 4833 TEST_REQUIRES_ARM_NEON_BF16; 4834 for (uint32_t n = 9; n < 16; n++) { 4835 for (size_t k = 1; k <= 40; k += 9) { 4836 GemmMicrokernelTester() 4837 .mr(1) 4838 .nr(8) 4839 .kr(2) 4840 .sr(1) 4841 .m(1) 4842 .n(n) 4843 .k(k) 4844 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4845 } 4846 } 4847 } 4848 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_strided_cn)4849 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_cn) { 4850 TEST_REQUIRES_ARM_NEON_BF16; 4851 for (uint32_t n = 9; n < 16; n++) { 4852 
for (size_t k = 1; k <= 40; k += 9) { 4853 GemmMicrokernelTester() 4854 .mr(1) 4855 .nr(8) 4856 .kr(2) 4857 .sr(1) 4858 .m(1) 4859 .n(n) 4860 .k(k) 4861 .cn_stride(11) 4862 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4863 } 4864 } 4865 } 4866 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_strided_a)4867 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_a) { 4868 TEST_REQUIRES_ARM_NEON_BF16; 4869 for (uint32_t n = 9; n < 16; n++) { 4870 for (size_t k = 1; k <= 40; k += 9) { 4871 GemmMicrokernelTester() 4872 .mr(1) 4873 .nr(8) 4874 .kr(2) 4875 .sr(1) 4876 .m(1) 4877 .n(n) 4878 .k(k) 4879 .a_stride(43) 4880 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4881 } 4882 } 4883 } 4884 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_subtile)4885 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_subtile) { 4886 TEST_REQUIRES_ARM_NEON_BF16; 4887 for (uint32_t n = 9; n < 16; n++) { 4888 for (size_t k = 1; k <= 40; k += 9) { 4889 for (uint32_t m = 1; m <= 1; m++) { 4890 GemmMicrokernelTester() 4891 .mr(1) 4892 .nr(8) 4893 .kr(2) 4894 .sr(1) 4895 .m(m) 4896 .n(n) 4897 .k(k) 4898 .iterations(1) 4899 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4900 } 4901 } 4902 } 4903 } 4904 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8)4905 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8) { 4906 TEST_REQUIRES_ARM_NEON_BF16; 4907 for (uint32_t n = 16; n <= 24; n += 8) { 4908 for (size_t k = 1; k <= 40; k += 9) { 4909 GemmMicrokernelTester() 4910 .mr(1) 4911 .nr(8) 4912 .kr(2) 4913 .sr(1) 4914 .m(1) 4915 .n(n) 4916 .k(k) 4917 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4918 } 4919 } 4920 } 4921 
TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_strided_cn)4922 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_cn) { 4923 TEST_REQUIRES_ARM_NEON_BF16; 4924 for (uint32_t n = 16; n <= 24; n += 8) { 4925 for (size_t k = 1; k <= 40; k += 9) { 4926 GemmMicrokernelTester() 4927 .mr(1) 4928 .nr(8) 4929 .kr(2) 4930 .sr(1) 4931 .m(1) 4932 .n(n) 4933 .k(k) 4934 .cn_stride(11) 4935 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4936 } 4937 } 4938 } 4939 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_strided_a)4940 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_a) { 4941 TEST_REQUIRES_ARM_NEON_BF16; 4942 for (uint32_t n = 16; n <= 24; n += 8) { 4943 for (size_t k = 1; k <= 40; k += 9) { 4944 GemmMicrokernelTester() 4945 .mr(1) 4946 .nr(8) 4947 .kr(2) 4948 .sr(1) 4949 .m(1) 4950 .n(n) 4951 .k(k) 4952 .a_stride(43) 4953 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4954 } 4955 } 4956 } 4957 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_subtile)4958 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_subtile) { 4959 TEST_REQUIRES_ARM_NEON_BF16; 4960 for (uint32_t n = 16; n <= 24; n += 8) { 4961 for (size_t k = 1; k <= 40; k += 9) { 4962 for (uint32_t m = 1; m <= 1; m++) { 4963 GemmMicrokernelTester() 4964 .mr(1) 4965 .nr(8) 4966 .kr(2) 4967 .sr(1) 4968 .m(m) 4969 .n(n) 4970 .k(k) 4971 .iterations(1) 4972 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4973 } 4974 } 4975 } 4976 } 4977 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm_subtile)4978 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm_subtile) { 4979 TEST_REQUIRES_ARM_NEON_BF16; 4980 for (size_t k = 1; k <= 40; k += 9) { 4981 for (uint32_t n = 1; n <= 8; n++) { 4982 for (uint32_t m = 1; m <= 1; m++) { 
4983 GemmMicrokernelTester() 4984 .mr(1) 4985 .nr(8) 4986 .kr(2) 4987 .sr(1) 4988 .m(m) 4989 .n(n) 4990 .k(k) 4991 .cm_stride(11) 4992 .iterations(1) 4993 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 4994 } 4995 } 4996 } 4997 } 4998 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,qmin)4999 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, qmin) { 5000 TEST_REQUIRES_ARM_NEON_BF16; 5001 GemmMicrokernelTester() 5002 .mr(1) 5003 .nr(8) 5004 .kr(2) 5005 .sr(1) 5006 .m(1) 5007 .n(8) 5008 .k(8) 5009 .qmin(128) 5010 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5011 } 5012 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,qmax)5013 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, qmax) { 5014 TEST_REQUIRES_ARM_NEON_BF16; 5015 GemmMicrokernelTester() 5016 .mr(1) 5017 .nr(8) 5018 .kr(2) 5019 .sr(1) 5020 .m(1) 5021 .n(8) 5022 .k(8) 5023 .qmax(128) 5024 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5025 } 5026 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm)5027 TEST(BF16_GEMM_MINMAX_1X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm) { 5028 TEST_REQUIRES_ARM_NEON_BF16; 5029 GemmMicrokernelTester() 5030 .mr(1) 5031 .nr(8) 5032 .kr(2) 5033 .sr(1) 5034 .m(1) 5035 .n(8) 5036 .k(8) 5037 .cm_stride(11) 5038 .Test(xnn_bf16_gemm_minmax_ukernel_1x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5039 } 5040 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 5041 5042 5043 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8)5044 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8) { 5045 TEST_REQUIRES_ARM_NEON_BF16; 5046 GemmMicrokernelTester() 5047 .mr(4) 5048 .nr(8) 5049 .kr(2) 5050 .sr(1) 5051 .m(4) 5052 .n(8) 5053 .k(8) 5054 
.Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5055 } 5056 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cn)5057 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cn) { 5058 TEST_REQUIRES_ARM_NEON_BF16; 5059 GemmMicrokernelTester() 5060 .mr(4) 5061 .nr(8) 5062 .kr(2) 5063 .sr(1) 5064 .m(4) 5065 .n(8) 5066 .k(8) 5067 .cn_stride(11) 5068 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5069 } 5070 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_strided_a)5071 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_strided_a) { 5072 TEST_REQUIRES_ARM_NEON_BF16; 5073 GemmMicrokernelTester() 5074 .mr(4) 5075 .nr(8) 5076 .kr(2) 5077 .sr(1) 5078 .m(4) 5079 .n(8) 5080 .k(8) 5081 .a_stride(11) 5082 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5083 } 5084 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile)5085 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile) { 5086 TEST_REQUIRES_ARM_NEON_BF16; 5087 for (uint32_t n = 1; n <= 8; n++) { 5088 for (uint32_t m = 1; m <= 4; m++) { 5089 GemmMicrokernelTester() 5090 .mr(4) 5091 .nr(8) 5092 .kr(2) 5093 .sr(1) 5094 .m(m) 5095 .n(n) 5096 .k(8) 5097 .iterations(1) 5098 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5099 } 5100 } 5101 } 5102 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile_m)5103 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_m) { 5104 TEST_REQUIRES_ARM_NEON_BF16; 5105 for (uint32_t m = 1; m <= 4; m++) { 5106 GemmMicrokernelTester() 5107 .mr(4) 5108 .nr(8) 5109 .kr(2) 5110 .sr(1) 5111 .m(m) 5112 .n(8) 5113 .k(8) 5114 .iterations(1) 5115 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5116 } 5117 } 
5118 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile_n)5119 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_n) { 5120 TEST_REQUIRES_ARM_NEON_BF16; 5121 for (uint32_t n = 1; n <= 8; n++) { 5122 GemmMicrokernelTester() 5123 .mr(4) 5124 .nr(8) 5125 .kr(2) 5126 .sr(1) 5127 .m(4) 5128 .n(n) 5129 .k(8) 5130 .iterations(1) 5131 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5132 } 5133 } 5134 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8)5135 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8) { 5136 TEST_REQUIRES_ARM_NEON_BF16; 5137 for (size_t k = 1; k < 8; k++) { 5138 GemmMicrokernelTester() 5139 .mr(4) 5140 .nr(8) 5141 .kr(2) 5142 .sr(1) 5143 .m(4) 5144 .n(8) 5145 .k(k) 5146 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5147 } 5148 } 5149 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8_strided_a)5150 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_strided_a) { 5151 TEST_REQUIRES_ARM_NEON_BF16; 5152 for (size_t k = 1; k < 8; k++) { 5153 GemmMicrokernelTester() 5154 .mr(4) 5155 .nr(8) 5156 .kr(2) 5157 .sr(1) 5158 .m(4) 5159 .n(8) 5160 .k(k) 5161 .a_stride(11) 5162 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5163 } 5164 } 5165 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8_subtile)5166 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_subtile) { 5167 TEST_REQUIRES_ARM_NEON_BF16; 5168 for (size_t k = 1; k < 8; k++) { 5169 for (uint32_t n = 1; n <= 8; n++) { 5170 for (uint32_t m = 1; m <= 4; m++) { 5171 GemmMicrokernelTester() 5172 .mr(4) 5173 .nr(8) 5174 .kr(2) 5175 .sr(1) 5176 .m(m) 5177 .n(n) 5178 .k(k) 5179 .iterations(1) 5180 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5181 } 5182 } 5183 } 5184 } 5185 
// k = 9..15; m=4, n=8.
TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(2).sr(1);
    tester.m(4).n(8).k(k);
    tester.Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128,
                xnn_init_bf16_minmax_scalar_params);
  }
}

// k = 9..15 with a_stride(19); m=4, n=8.
TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(2).sr(1);
    tester.m(4).n(8).k(k).a_stride(19);
    tester.Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128,
                xnn_init_bf16_minmax_scalar_params);
  }
}

// k = 9..15, n = 1..8, m = 1..4; iterations(1).
TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester tester;
        tester.mr(4).nr(8).kr(2).sr(1);
        tester.m(m).n(n).k(k).iterations(1);
        tester.Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128,
                    xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// k = 16, 24, ..., 80; m=4, n=8.
TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester tester;
    tester.mr(4).nr(8).kr(2).sr(1);
    tester.m(4).n(8).k(k);
    tester.Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128,
                xnn_init_bf16_minmax_scalar_params);
  }
}
TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8_strided_a)5252 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_strided_a) { 5253 TEST_REQUIRES_ARM_NEON_BF16; 5254 for (size_t k = 16; k <= 80; k += 8) { 5255 GemmMicrokernelTester() 5256 .mr(4) 5257 .nr(8) 5258 .kr(2) 5259 .sr(1) 5260 .m(4) 5261 .n(8) 5262 .k(k) 5263 .a_stride(83) 5264 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5265 } 5266 } 5267 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8_subtile)5268 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_subtile) { 5269 TEST_REQUIRES_ARM_NEON_BF16; 5270 for (size_t k = 16; k <= 80; k += 8) { 5271 for (uint32_t n = 1; n <= 8; n++) { 5272 for (uint32_t m = 1; m <= 4; m++) { 5273 GemmMicrokernelTester() 5274 .mr(4) 5275 .nr(8) 5276 .kr(2) 5277 .sr(1) 5278 .m(m) 5279 .n(n) 5280 .k(k) 5281 .iterations(1) 5282 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5283 } 5284 } 5285 } 5286 } 5287 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8)5288 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8) { 5289 TEST_REQUIRES_ARM_NEON_BF16; 5290 for (uint32_t n = 9; n < 16; n++) { 5291 for (size_t k = 1; k <= 40; k += 9) { 5292 GemmMicrokernelTester() 5293 .mr(4) 5294 .nr(8) 5295 .kr(2) 5296 .sr(1) 5297 .m(4) 5298 .n(n) 5299 .k(k) 5300 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5301 } 5302 } 5303 } 5304 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_strided_cn)5305 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_cn) { 5306 TEST_REQUIRES_ARM_NEON_BF16; 5307 for (uint32_t n = 9; n < 16; n++) { 5308 for (size_t k = 1; k <= 40; k += 9) { 5309 GemmMicrokernelTester() 5310 .mr(4) 5311 .nr(8) 5312 .kr(2) 5313 .sr(1) 5314 .m(4) 5315 .n(n) 5316 .k(k) 5317 .cn_stride(11) 5318 
.Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5319 } 5320 } 5321 } 5322 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_strided_a)5323 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_a) { 5324 TEST_REQUIRES_ARM_NEON_BF16; 5325 for (uint32_t n = 9; n < 16; n++) { 5326 for (size_t k = 1; k <= 40; k += 9) { 5327 GemmMicrokernelTester() 5328 .mr(4) 5329 .nr(8) 5330 .kr(2) 5331 .sr(1) 5332 .m(4) 5333 .n(n) 5334 .k(k) 5335 .a_stride(43) 5336 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5337 } 5338 } 5339 } 5340 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_subtile)5341 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_subtile) { 5342 TEST_REQUIRES_ARM_NEON_BF16; 5343 for (uint32_t n = 9; n < 16; n++) { 5344 for (size_t k = 1; k <= 40; k += 9) { 5345 for (uint32_t m = 1; m <= 4; m++) { 5346 GemmMicrokernelTester() 5347 .mr(4) 5348 .nr(8) 5349 .kr(2) 5350 .sr(1) 5351 .m(m) 5352 .n(n) 5353 .k(k) 5354 .iterations(1) 5355 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5356 } 5357 } 5358 } 5359 } 5360 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8)5361 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8) { 5362 TEST_REQUIRES_ARM_NEON_BF16; 5363 for (uint32_t n = 16; n <= 24; n += 8) { 5364 for (size_t k = 1; k <= 40; k += 9) { 5365 GemmMicrokernelTester() 5366 .mr(4) 5367 .nr(8) 5368 .kr(2) 5369 .sr(1) 5370 .m(4) 5371 .n(n) 5372 .k(k) 5373 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5374 } 5375 } 5376 } 5377 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_strided_cn)5378 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_cn) { 5379 TEST_REQUIRES_ARM_NEON_BF16; 5380 for (uint32_t n = 16; n <= 24; n += 8) { 5381 
for (size_t k = 1; k <= 40; k += 9) { 5382 GemmMicrokernelTester() 5383 .mr(4) 5384 .nr(8) 5385 .kr(2) 5386 .sr(1) 5387 .m(4) 5388 .n(n) 5389 .k(k) 5390 .cn_stride(11) 5391 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5392 } 5393 } 5394 } 5395 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_strided_a)5396 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_a) { 5397 TEST_REQUIRES_ARM_NEON_BF16; 5398 for (uint32_t n = 16; n <= 24; n += 8) { 5399 for (size_t k = 1; k <= 40; k += 9) { 5400 GemmMicrokernelTester() 5401 .mr(4) 5402 .nr(8) 5403 .kr(2) 5404 .sr(1) 5405 .m(4) 5406 .n(n) 5407 .k(k) 5408 .a_stride(43) 5409 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5410 } 5411 } 5412 } 5413 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_subtile)5414 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_subtile) { 5415 TEST_REQUIRES_ARM_NEON_BF16; 5416 for (uint32_t n = 16; n <= 24; n += 8) { 5417 for (size_t k = 1; k <= 40; k += 9) { 5418 for (uint32_t m = 1; m <= 4; m++) { 5419 GemmMicrokernelTester() 5420 .mr(4) 5421 .nr(8) 5422 .kr(2) 5423 .sr(1) 5424 .m(m) 5425 .n(n) 5426 .k(k) 5427 .iterations(1) 5428 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5429 } 5430 } 5431 } 5432 } 5433 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm_subtile)5434 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm_subtile) { 5435 TEST_REQUIRES_ARM_NEON_BF16; 5436 for (size_t k = 1; k <= 40; k += 9) { 5437 for (uint32_t n = 1; n <= 8; n++) { 5438 for (uint32_t m = 1; m <= 4; m++) { 5439 GemmMicrokernelTester() 5440 .mr(4) 5441 .nr(8) 5442 .kr(2) 5443 .sr(1) 5444 .m(m) 5445 .n(n) 5446 .k(k) 5447 .cm_stride(11) 5448 .iterations(1) 5449 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, 
xnn_init_bf16_minmax_scalar_params); 5450 } 5451 } 5452 } 5453 } 5454 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,qmin)5455 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, qmin) { 5456 TEST_REQUIRES_ARM_NEON_BF16; 5457 GemmMicrokernelTester() 5458 .mr(4) 5459 .nr(8) 5460 .kr(2) 5461 .sr(1) 5462 .m(4) 5463 .n(8) 5464 .k(8) 5465 .qmin(128) 5466 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5467 } 5468 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,qmax)5469 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, qmax) { 5470 TEST_REQUIRES_ARM_NEON_BF16; 5471 GemmMicrokernelTester() 5472 .mr(4) 5473 .nr(8) 5474 .kr(2) 5475 .sr(1) 5476 .m(4) 5477 .n(8) 5478 .k(8) 5479 .qmax(128) 5480 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5481 } 5482 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm)5483 TEST(BF16_GEMM_MINMAX_4X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm) { 5484 TEST_REQUIRES_ARM_NEON_BF16; 5485 GemmMicrokernelTester() 5486 .mr(4) 5487 .nr(8) 5488 .kr(2) 5489 .sr(1) 5490 .m(4) 5491 .n(8) 5492 .k(8) 5493 .cm_stride(11) 5494 .Test(xnn_bf16_gemm_minmax_ukernel_4x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5495 } 5496 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 5497 5498 5499 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8)5500 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8) { 5501 TEST_REQUIRES_ARM_NEON_BF16; 5502 GemmMicrokernelTester() 5503 .mr(5) 5504 .nr(8) 5505 .kr(2) 5506 .sr(1) 5507 .m(5) 5508 .n(8) 5509 .k(8) 5510 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5511 } 5512 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cn)5513 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, 
strided_cn) { 5514 TEST_REQUIRES_ARM_NEON_BF16; 5515 GemmMicrokernelTester() 5516 .mr(5) 5517 .nr(8) 5518 .kr(2) 5519 .sr(1) 5520 .m(5) 5521 .n(8) 5522 .k(8) 5523 .cn_stride(11) 5524 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5525 } 5526 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_strided_a)5527 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_strided_a) { 5528 TEST_REQUIRES_ARM_NEON_BF16; 5529 GemmMicrokernelTester() 5530 .mr(5) 5531 .nr(8) 5532 .kr(2) 5533 .sr(1) 5534 .m(5) 5535 .n(8) 5536 .k(8) 5537 .a_stride(11) 5538 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5539 } 5540 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile)5541 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile) { 5542 TEST_REQUIRES_ARM_NEON_BF16; 5543 for (uint32_t n = 1; n <= 8; n++) { 5544 for (uint32_t m = 1; m <= 5; m++) { 5545 GemmMicrokernelTester() 5546 .mr(5) 5547 .nr(8) 5548 .kr(2) 5549 .sr(1) 5550 .m(m) 5551 .n(n) 5552 .k(8) 5553 .iterations(1) 5554 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5555 } 5556 } 5557 } 5558 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile_m)5559 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_m) { 5560 TEST_REQUIRES_ARM_NEON_BF16; 5561 for (uint32_t m = 1; m <= 5; m++) { 5562 GemmMicrokernelTester() 5563 .mr(5) 5564 .nr(8) 5565 .kr(2) 5566 .sr(1) 5567 .m(m) 5568 .n(8) 5569 .k(8) 5570 .iterations(1) 5571 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5572 } 5573 } 5574 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8_subtile_n)5575 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_n) { 5576 TEST_REQUIRES_ARM_NEON_BF16; 5577 for (uint32_t n = 1; n <= 8; n++) { 5578 
GemmMicrokernelTester() 5579 .mr(5) 5580 .nr(8) 5581 .kr(2) 5582 .sr(1) 5583 .m(5) 5584 .n(n) 5585 .k(8) 5586 .iterations(1) 5587 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5588 } 5589 } 5590 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8)5591 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8) { 5592 TEST_REQUIRES_ARM_NEON_BF16; 5593 for (size_t k = 1; k < 8; k++) { 5594 GemmMicrokernelTester() 5595 .mr(5) 5596 .nr(8) 5597 .kr(2) 5598 .sr(1) 5599 .m(5) 5600 .n(8) 5601 .k(k) 5602 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5603 } 5604 } 5605 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8_strided_a)5606 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_strided_a) { 5607 TEST_REQUIRES_ARM_NEON_BF16; 5608 for (size_t k = 1; k < 8; k++) { 5609 GemmMicrokernelTester() 5610 .mr(5) 5611 .nr(8) 5612 .kr(2) 5613 .sr(1) 5614 .m(5) 5615 .n(8) 5616 .k(k) 5617 .a_stride(11) 5618 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5619 } 5620 } 5621 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_lt_8_subtile)5622 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_subtile) { 5623 TEST_REQUIRES_ARM_NEON_BF16; 5624 for (size_t k = 1; k < 8; k++) { 5625 for (uint32_t n = 1; n <= 8; n++) { 5626 for (uint32_t m = 1; m <= 5; m++) { 5627 GemmMicrokernelTester() 5628 .mr(5) 5629 .nr(8) 5630 .kr(2) 5631 .sr(1) 5632 .m(m) 5633 .n(n) 5634 .k(k) 5635 .iterations(1) 5636 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5637 } 5638 } 5639 } 5640 } 5641 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_gt_8)5642 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8) { 5643 TEST_REQUIRES_ARM_NEON_BF16; 5644 for (size_t k = 9; k < 16; k++) { 5645 GemmMicrokernelTester() 
5646 .mr(5) 5647 .nr(8) 5648 .kr(2) 5649 .sr(1) 5650 .m(5) 5651 .n(8) 5652 .k(k) 5653 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5654 } 5655 } 5656 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_gt_8_strided_a)5657 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_strided_a) { 5658 TEST_REQUIRES_ARM_NEON_BF16; 5659 for (size_t k = 9; k < 16; k++) { 5660 GemmMicrokernelTester() 5661 .mr(5) 5662 .nr(8) 5663 .kr(2) 5664 .sr(1) 5665 .m(5) 5666 .n(8) 5667 .k(k) 5668 .a_stride(19) 5669 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5670 } 5671 } 5672 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_gt_8_subtile)5673 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_subtile) { 5674 TEST_REQUIRES_ARM_NEON_BF16; 5675 for (size_t k = 9; k < 16; k++) { 5676 for (uint32_t n = 1; n <= 8; n++) { 5677 for (uint32_t m = 1; m <= 5; m++) { 5678 GemmMicrokernelTester() 5679 .mr(5) 5680 .nr(8) 5681 .kr(2) 5682 .sr(1) 5683 .m(m) 5684 .n(n) 5685 .k(k) 5686 .iterations(1) 5687 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5688 } 5689 } 5690 } 5691 } 5692 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8)5693 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8) { 5694 TEST_REQUIRES_ARM_NEON_BF16; 5695 for (size_t k = 16; k <= 80; k += 8) { 5696 GemmMicrokernelTester() 5697 .mr(5) 5698 .nr(8) 5699 .kr(2) 5700 .sr(1) 5701 .m(5) 5702 .n(8) 5703 .k(k) 5704 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5705 } 5706 } 5707 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8_strided_a)5708 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_strided_a) { 5709 TEST_REQUIRES_ARM_NEON_BF16; 5710 for (size_t k = 16; k <= 80; k += 8) { 5711 GemmMicrokernelTester() 5712 
.mr(5) 5713 .nr(8) 5714 .kr(2) 5715 .sr(1) 5716 .m(5) 5717 .n(8) 5718 .k(k) 5719 .a_stride(83) 5720 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5721 } 5722 } 5723 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,k_div_8_subtile)5724 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_subtile) { 5725 TEST_REQUIRES_ARM_NEON_BF16; 5726 for (size_t k = 16; k <= 80; k += 8) { 5727 for (uint32_t n = 1; n <= 8; n++) { 5728 for (uint32_t m = 1; m <= 5; m++) { 5729 GemmMicrokernelTester() 5730 .mr(5) 5731 .nr(8) 5732 .kr(2) 5733 .sr(1) 5734 .m(m) 5735 .n(n) 5736 .k(k) 5737 .iterations(1) 5738 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5739 } 5740 } 5741 } 5742 } 5743 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8)5744 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8) { 5745 TEST_REQUIRES_ARM_NEON_BF16; 5746 for (uint32_t n = 9; n < 16; n++) { 5747 for (size_t k = 1; k <= 40; k += 9) { 5748 GemmMicrokernelTester() 5749 .mr(5) 5750 .nr(8) 5751 .kr(2) 5752 .sr(1) 5753 .m(5) 5754 .n(n) 5755 .k(k) 5756 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5757 } 5758 } 5759 } 5760 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_strided_cn)5761 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_cn) { 5762 TEST_REQUIRES_ARM_NEON_BF16; 5763 for (uint32_t n = 9; n < 16; n++) { 5764 for (size_t k = 1; k <= 40; k += 9) { 5765 GemmMicrokernelTester() 5766 .mr(5) 5767 .nr(8) 5768 .kr(2) 5769 .sr(1) 5770 .m(5) 5771 .n(n) 5772 .k(k) 5773 .cn_stride(11) 5774 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5775 } 5776 } 5777 } 5778 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_strided_a)5779 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_a) 
{ 5780 TEST_REQUIRES_ARM_NEON_BF16; 5781 for (uint32_t n = 9; n < 16; n++) { 5782 for (size_t k = 1; k <= 40; k += 9) { 5783 GemmMicrokernelTester() 5784 .mr(5) 5785 .nr(8) 5786 .kr(2) 5787 .sr(1) 5788 .m(5) 5789 .n(n) 5790 .k(k) 5791 .a_stride(43) 5792 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5793 } 5794 } 5795 } 5796 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_gt_8_subtile)5797 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_subtile) { 5798 TEST_REQUIRES_ARM_NEON_BF16; 5799 for (uint32_t n = 9; n < 16; n++) { 5800 for (size_t k = 1; k <= 40; k += 9) { 5801 for (uint32_t m = 1; m <= 5; m++) { 5802 GemmMicrokernelTester() 5803 .mr(5) 5804 .nr(8) 5805 .kr(2) 5806 .sr(1) 5807 .m(m) 5808 .n(n) 5809 .k(k) 5810 .iterations(1) 5811 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5812 } 5813 } 5814 } 5815 } 5816 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8)5817 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8) { 5818 TEST_REQUIRES_ARM_NEON_BF16; 5819 for (uint32_t n = 16; n <= 24; n += 8) { 5820 for (size_t k = 1; k <= 40; k += 9) { 5821 GemmMicrokernelTester() 5822 .mr(5) 5823 .nr(8) 5824 .kr(2) 5825 .sr(1) 5826 .m(5) 5827 .n(n) 5828 .k(k) 5829 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5830 } 5831 } 5832 } 5833 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_strided_cn)5834 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_cn) { 5835 TEST_REQUIRES_ARM_NEON_BF16; 5836 for (uint32_t n = 16; n <= 24; n += 8) { 5837 for (size_t k = 1; k <= 40; k += 9) { 5838 GemmMicrokernelTester() 5839 .mr(5) 5840 .nr(8) 5841 .kr(2) 5842 .sr(1) 5843 .m(5) 5844 .n(n) 5845 .k(k) 5846 .cn_stride(11) 5847 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5848 
} 5849 } 5850 } 5851 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_strided_a)5852 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_a) { 5853 TEST_REQUIRES_ARM_NEON_BF16; 5854 for (uint32_t n = 16; n <= 24; n += 8) { 5855 for (size_t k = 1; k <= 40; k += 9) { 5856 GemmMicrokernelTester() 5857 .mr(5) 5858 .nr(8) 5859 .kr(2) 5860 .sr(1) 5861 .m(5) 5862 .n(n) 5863 .k(k) 5864 .a_stride(43) 5865 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5866 } 5867 } 5868 } 5869 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,n_div_8_subtile)5870 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_subtile) { 5871 TEST_REQUIRES_ARM_NEON_BF16; 5872 for (uint32_t n = 16; n <= 24; n += 8) { 5873 for (size_t k = 1; k <= 40; k += 9) { 5874 for (uint32_t m = 1; m <= 5; m++) { 5875 GemmMicrokernelTester() 5876 .mr(5) 5877 .nr(8) 5878 .kr(2) 5879 .sr(1) 5880 .m(m) 5881 .n(n) 5882 .k(k) 5883 .iterations(1) 5884 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5885 } 5886 } 5887 } 5888 } 5889 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm_subtile)5890 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm_subtile) { 5891 TEST_REQUIRES_ARM_NEON_BF16; 5892 for (size_t k = 1; k <= 40; k += 9) { 5893 for (uint32_t n = 1; n <= 8; n++) { 5894 for (uint32_t m = 1; m <= 5; m++) { 5895 GemmMicrokernelTester() 5896 .mr(5) 5897 .nr(8) 5898 .kr(2) 5899 .sr(1) 5900 .m(m) 5901 .n(n) 5902 .k(k) 5903 .cm_stride(11) 5904 .iterations(1) 5905 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5906 } 5907 } 5908 } 5909 } 5910 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,qmin)5911 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, qmin) { 5912 TEST_REQUIRES_ARM_NEON_BF16; 5913 GemmMicrokernelTester() 5914 .mr(5) 5915 .nr(8) 5916 .kr(2) 5917 
.sr(1) 5918 .m(5) 5919 .n(8) 5920 .k(8) 5921 .qmin(128) 5922 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5923 } 5924 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,qmax)5925 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, qmax) { 5926 TEST_REQUIRES_ARM_NEON_BF16; 5927 GemmMicrokernelTester() 5928 .mr(5) 5929 .nr(8) 5930 .kr(2) 5931 .sr(1) 5932 .m(5) 5933 .n(8) 5934 .k(8) 5935 .qmax(128) 5936 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5937 } 5938 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm)5939 TEST(BF16_GEMM_MINMAX_5X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm) { 5940 TEST_REQUIRES_ARM_NEON_BF16; 5941 GemmMicrokernelTester() 5942 .mr(5) 5943 .nr(8) 5944 .kr(2) 5945 .sr(1) 5946 .m(5) 5947 .n(8) 5948 .k(8) 5949 .cm_stride(11) 5950 .Test(xnn_bf16_gemm_minmax_ukernel_5x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5951 } 5952 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 5953 5954 5955 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128,k_eq_8)5956 TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8) { 5957 TEST_REQUIRES_ARM_NEON_BF16; 5958 GemmMicrokernelTester() 5959 .mr(6) 5960 .nr(8) 5961 .kr(2) 5962 .sr(1) 5963 .m(6) 5964 .n(8) 5965 .k(8) 5966 .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5967 } 5968 TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cn)5969 TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cn) { 5970 TEST_REQUIRES_ARM_NEON_BF16; 5971 GemmMicrokernelTester() 5972 .mr(6) 5973 .nr(8) 5974 .kr(2) 5975 .sr(1) 5976 .m(6) 5977 .n(8) 5978 .k(8) 5979 .cn_stride(11) 5980 .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 5981 } 5982 
// m=6, n=8, k=8 with a_stride(11).
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  GemmMicrokernelTester tester;
  tester.mr(6).nr(8).kr(2).sr(1);
  tester.m(6).n(8).k(8).a_stride(11);
  tester.Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128,
              xnn_init_bf16_minmax_scalar_params);
}

// k=8; every (m, n) with n = 1..8 and m = 1..6; iterations(1).
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 1; n <= 8; ++n) {
    for (uint32_t m = 1; m <= 6; ++m) {
      GemmMicrokernelTester tester;
      tester.mr(6).nr(8).kr(2).sr(1);
      tester.m(m).n(n).k(8).iterations(1);
      tester.Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128,
                  xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// k=8, n=8; m = 1..6; iterations(1).
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t m = 1; m <= 6; ++m) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(2).sr(1);
    tester.m(m).n(8).k(8).iterations(1);
    tester.Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128,
                xnn_init_bf16_minmax_scalar_params);
  }
}

// k=8, m=6; n = 1..8; iterations(1).
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester tester;
    tester.mr(6).nr(8).kr(2).sr(1);
    tester.m(6).n(n).k(8).iterations(1);
    tester.Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128,
                xnn_init_bf16_minmax_scalar_params);
  }
}
// k swept over 1..7 with m = mr and n = nr.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(2).sr(1)
      .m(6).n(8).k(kk)
      .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
  }
}

// k swept over 1..7 with a_stride set to 11.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(2).sr(1)
      .m(6).n(8).k(kk)
      .a_stride(11)
      .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every (k, n, m) in 1..7 x 1..8 x 1..6, one iteration per combination.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 1; kk < 8; ++kk) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 6; ++mm) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(2).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// k swept over 9..15 with m = mr and n = nr.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(2).sr(1)
      .m(6).n(8).k(kk)
      .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
  }
}
// k swept over 9..15 with a_stride set to 19.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(2).sr(1)
      .m(6).n(8).k(kk)
      .a_stride(19)
      .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every (k, n, m) in 9..15 x 1..8 x 1..6, one iteration per combination.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 9; kk < 16; ++kk) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 6; ++mm) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(2).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// k stepped through 16, 24, ..., 80 with m = mr and n = nr.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(2).sr(1)
      .m(6).n(8).k(kk)
      .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
  }
}

// k stepped through 16, 24, ..., 80 with a_stride set to 83.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(2).sr(1)
      .m(6).n(8).k(kk)
      .a_stride(83)
      .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
  }
}

// k in 16, 24, ..., 80 crossed with every (n, m) in 1..8 x 1..6, one iteration each.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 6; ++mm) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(2).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// n swept over 9..15, k stepped through 1, 10, ..., 37.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(2).sr(1)
        .m(6).n(nn).k(kk)
        .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_gt_8, with cn_stride set to 11.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(2).sr(1)
        .m(6).n(nn).k(kk)
        .cn_stride(11)
        .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_gt_8, with a_stride set to 43.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(2).sr(1)
        .m(6).n(nn).k(kk)
        .a_stride(43)
        .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in 9..15 and k in 1, 10, ..., 37 crossed with m in 1..6, one iteration each.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 9; nn < 16; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 6; ++mm) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(2).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// n takes 16 and 24, k stepped through 1, 10, ..., 37.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(2).sr(1)
        .m(6).n(nn).k(kk)
        .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_div_8, with cn_stride set to 11.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(2).sr(1)
        .m(6).n(nn).k(kk)
        .cn_stride(11)
        .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_div_8, with a_stride set to 43.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(2).sr(1)
        .m(6).n(nn).k(kk)
        .a_stride(43)
        .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in {16, 24} and k in 1, 10, ..., 37 crossed with m in 1..6, one iteration each.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 16; nn <= 24; nn += 8) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 6; ++mm) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(2).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// k in 1, 10, ..., 37 crossed with every (n, m) in 1..8 x 1..6, cm_stride set to 11.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 1; kk <= 40; kk += 9) {
    for (uint32_t nn = 1; nn <= 8; ++nn) {
      for (uint32_t mm = 1; mm <= 6; ++mm) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(2).sr(1)
          .m(mm).n(nn).k(kk)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// m = mr, n = nr, k = 8, with qmin set to 128.
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, qmin) {
  TEST_REQUIRES_ARM_NEON_BF16;
  GemmMicrokernelTester()
    .mr(6).nr(8).kr(2).sr(1)
    .m(6).n(8).k(8)
    .qmin(128)
    .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params);
}
TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128,qmax)6381 TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, qmax) { 6382 TEST_REQUIRES_ARM_NEON_BF16; 6383 GemmMicrokernelTester() 6384 .mr(6) 6385 .nr(8) 6386 .kr(2) 6387 .sr(1) 6388 .m(6) 6389 .n(8) 6390 .k(8) 6391 .qmax(128) 6392 .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 6393 } 6394 TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128,strided_cm)6395 TEST(BF16_GEMM_MINMAX_6X8C2__NEONBF16_BFDOT_LANE_LD128, strided_cm) { 6396 TEST_REQUIRES_ARM_NEON_BF16; 6397 GemmMicrokernelTester() 6398 .mr(6) 6399 .nr(8) 6400 .kr(2) 6401 .sr(1) 6402 .m(6) 6403 .n(8) 6404 .k(8) 6405 .cm_stride(11) 6406 .Test(xnn_bf16_gemm_minmax_ukernel_6x8c2__neonbf16_bfdot_lane_ld128, xnn_init_bf16_minmax_scalar_params); 6407 } 6408 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 6409 6410 6411 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,k_eq_8)6412 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_eq_8) { 6413 TEST_REQUIRES_ARM_NEON_BF16; 6414 GemmMicrokernelTester() 6415 .mr(1) 6416 .nr(4) 6417 .kr(8) 6418 .sr(1) 6419 .m(1) 6420 .n(4) 6421 .k(8) 6422 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6423 } 6424 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,strided_cn)6425 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, strided_cn) { 6426 TEST_REQUIRES_ARM_NEON_BF16; 6427 GemmMicrokernelTester() 6428 .mr(1) 6429 .nr(4) 6430 .kr(8) 6431 .sr(1) 6432 .m(1) 6433 .n(4) 6434 .k(8) 6435 .cn_stride(7) 6436 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6437 } 6438 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,k_eq_8_strided_a)6439 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_eq_8_strided_a) { 6440 TEST_REQUIRES_ARM_NEON_BF16; 6441 GemmMicrokernelTester() 6442 .mr(1) 6443 .nr(4) 6444 .kr(8) 6445 .sr(1) 6446 .m(1) 6447 
.n(4) 6448 .k(8) 6449 .a_stride(11) 6450 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6451 } 6452 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,k_eq_8_subtile)6453 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_eq_8_subtile) { 6454 TEST_REQUIRES_ARM_NEON_BF16; 6455 for (uint32_t n = 1; n <= 4; n++) { 6456 for (uint32_t m = 1; m <= 1; m++) { 6457 GemmMicrokernelTester() 6458 .mr(1) 6459 .nr(4) 6460 .kr(8) 6461 .sr(1) 6462 .m(m) 6463 .n(n) 6464 .k(8) 6465 .iterations(1) 6466 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6467 } 6468 } 6469 } 6470 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,k_eq_8_subtile_m)6471 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_eq_8_subtile_m) { 6472 TEST_REQUIRES_ARM_NEON_BF16; 6473 for (uint32_t m = 1; m <= 1; m++) { 6474 GemmMicrokernelTester() 6475 .mr(1) 6476 .nr(4) 6477 .kr(8) 6478 .sr(1) 6479 .m(m) 6480 .n(4) 6481 .k(8) 6482 .iterations(1) 6483 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6484 } 6485 } 6486 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,k_eq_8_subtile_n)6487 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_eq_8_subtile_n) { 6488 TEST_REQUIRES_ARM_NEON_BF16; 6489 for (uint32_t n = 1; n <= 4; n++) { 6490 GemmMicrokernelTester() 6491 .mr(1) 6492 .nr(4) 6493 .kr(8) 6494 .sr(1) 6495 .m(1) 6496 .n(n) 6497 .k(8) 6498 .iterations(1) 6499 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6500 } 6501 } 6502 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,k_lt_8)6503 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_lt_8) { 6504 TEST_REQUIRES_ARM_NEON_BF16; 6505 for (size_t k = 1; k < 8; k++) { 6506 GemmMicrokernelTester() 6507 .mr(1) 6508 .nr(4) 6509 .kr(8) 6510 .sr(1) 6511 .m(1) 6512 .n(4) 6513 .k(k) 6514 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6515 } 6516 } 6517 
// k swept over 1..7 with a_stride set to 11.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(kk)
      .a_stride(11)
      .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every (k, n, m) in 1..7 x 1..4 x 1..1, one iteration per combination.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 1; kk < 8; ++kk) {
    for (uint32_t nn = 1; nn <= 4; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// k swept over 9..15 with m = mr and n = nr.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(kk)
      .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
  }
}

// k swept over 9..15 with a_stride set to 19.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(kk)
      .a_stride(19)
      .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every (k, n, m) in 9..15 x 1..4 x 1..1, one iteration per combination.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 9; kk < 16; ++kk) {
    for (uint32_t nn = 1; nn <= 4; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// k stepped through 16, 24, ..., 80 with m = mr and n = nr.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(kk)
      .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
  }
}

// k stepped through 16, 24, ..., 80 with a_stride set to 83.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
      .mr(1).nr(4).kr(8).sr(1)
      .m(1).n(4).k(kk)
      .a_stride(83)
      .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
  }
}

// k in 16, 24, ..., 80 crossed with every (n, m) in 1..4 x 1..1, one iteration each.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    for (uint32_t nn = 1; nn <= 4; ++nn) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// n swept over 5..7, k stepped through 1, 10, ..., 37.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_gt_4) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 5; nn < 8; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nn).k(kk)
        .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_gt_4, with cn_stride set to 7.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_gt_4_strided_cn) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 5; nn < 8; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nn).k(kk)
        .cn_stride(7)
        .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_gt_4, with a_stride set to 43.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 5; nn < 8; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nn).k(kk)
        .a_stride(43)
        .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in 5..7 and k in 1, 10, ..., 37 crossed with m in 1..1, one iteration each.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 5; nn < 8; ++nn) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// n takes 8 and 12, k stepped through 1, 10, ..., 37.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_div_4) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 8; nn <= 12; nn += 4) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nn).k(kk)
        .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_div_4, with cn_stride set to 7.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_div_4_strided_cn) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 8; nn <= 12; nn += 4) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nn).k(kk)
        .cn_stride(7)
        .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// Same n/k sweep as n_div_4, with a_stride set to 43.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 8; nn <= 12; nn += 4) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(4).kr(8).sr(1)
        .m(1).n(nn).k(kk)
        .a_stride(43)
        .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in {8, 12} and k in 1, 10, ..., 37 crossed with m in 1..1, one iteration each.
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, n_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t nn = 8; nn <= 12; nn += 4) {
    for (size_t kk = 1; kk <= 40; kk += 9) {
      for (uint32_t mm = 1; mm <= 1; ++mm) {
        GemmMicrokernelTester()
          .mr(1).nr(4).kr(8).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, strided_cm_subtile) { 6803 TEST_REQUIRES_ARM_NEON_BF16; 6804 for (size_t k = 1; k <= 40; k += 9) { 6805 for (uint32_t n = 1; n <= 4; n++) { 6806 for (uint32_t m = 1; m <= 1; m++) { 6807 GemmMicrokernelTester() 6808 .mr(1) 6809 .nr(4) 6810 .kr(8) 6811 .sr(1) 6812 .m(m) 6813 .n(n) 6814 .k(k) 6815 .cm_stride(7) 6816 .iterations(1) 6817 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6818 } 6819 } 6820 } 6821 } 6822 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,qmin)6823 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, qmin) { 6824 TEST_REQUIRES_ARM_NEON_BF16; 6825 GemmMicrokernelTester() 6826 .mr(1) 6827 .nr(4) 6828 .kr(8) 6829 .sr(1) 6830 .m(1) 6831 .n(4) 6832 .k(8) 6833 .qmin(128) 6834 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6835 } 6836 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,qmax)6837 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, qmax) { 6838 TEST_REQUIRES_ARM_NEON_BF16; 6839 GemmMicrokernelTester() 6840 .mr(1) 6841 .nr(4) 6842 .kr(8) 6843 .sr(1) 6844 .m(1) 6845 .n(4) 6846 .k(8) 6847 .qmax(128) 6848 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6849 } 6850 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT,strided_cm)6851 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFDOT, strided_cm) { 6852 TEST_REQUIRES_ARM_NEON_BF16; 6853 GemmMicrokernelTester() 6854 .mr(1) 6855 .nr(4) 6856 .kr(8) 6857 .sr(1) 6858 .m(1) 6859 .n(4) 6860 .k(8) 6861 .cm_stride(7) 6862 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6863 } 6864 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 6865 6866 6867 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_eq_8)6868 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_eq_8) { 6869 TEST_REQUIRES_ARM_NEON_BF16; 6870 GemmMicrokernelTester() 6871 .mr(2) 6872 .nr(4) 
6873 .kr(8) 6874 .sr(1) 6875 .m(2) 6876 .n(4) 6877 .k(8) 6878 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6879 } 6880 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,strided_cn)6881 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, strided_cn) { 6882 TEST_REQUIRES_ARM_NEON_BF16; 6883 GemmMicrokernelTester() 6884 .mr(2) 6885 .nr(4) 6886 .kr(8) 6887 .sr(1) 6888 .m(2) 6889 .n(4) 6890 .k(8) 6891 .cn_stride(7) 6892 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6893 } 6894 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_eq_8_strided_a)6895 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_eq_8_strided_a) { 6896 TEST_REQUIRES_ARM_NEON_BF16; 6897 GemmMicrokernelTester() 6898 .mr(2) 6899 .nr(4) 6900 .kr(8) 6901 .sr(1) 6902 .m(2) 6903 .n(4) 6904 .k(8) 6905 .a_stride(11) 6906 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6907 } 6908 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_eq_8_subtile)6909 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_eq_8_subtile) { 6910 TEST_REQUIRES_ARM_NEON_BF16; 6911 for (uint32_t n = 1; n <= 4; n++) { 6912 for (uint32_t m = 1; m <= 2; m++) { 6913 GemmMicrokernelTester() 6914 .mr(2) 6915 .nr(4) 6916 .kr(8) 6917 .sr(1) 6918 .m(m) 6919 .n(n) 6920 .k(8) 6921 .iterations(1) 6922 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6923 } 6924 } 6925 } 6926 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_eq_8_subtile_m)6927 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_eq_8_subtile_m) { 6928 TEST_REQUIRES_ARM_NEON_BF16; 6929 for (uint32_t m = 1; m <= 2; m++) { 6930 GemmMicrokernelTester() 6931 .mr(2) 6932 .nr(4) 6933 .kr(8) 6934 .sr(1) 6935 .m(m) 6936 .n(4) 6937 .k(8) 6938 .iterations(1) 6939 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6940 } 6941 } 6942 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_eq_8_subtile_n)6943 
TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_eq_8_subtile_n) { 6944 TEST_REQUIRES_ARM_NEON_BF16; 6945 for (uint32_t n = 1; n <= 4; n++) { 6946 GemmMicrokernelTester() 6947 .mr(2) 6948 .nr(4) 6949 .kr(8) 6950 .sr(1) 6951 .m(2) 6952 .n(n) 6953 .k(8) 6954 .iterations(1) 6955 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6956 } 6957 } 6958 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_lt_8)6959 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_lt_8) { 6960 TEST_REQUIRES_ARM_NEON_BF16; 6961 for (size_t k = 1; k < 8; k++) { 6962 GemmMicrokernelTester() 6963 .mr(2) 6964 .nr(4) 6965 .kr(8) 6966 .sr(1) 6967 .m(2) 6968 .n(4) 6969 .k(k) 6970 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6971 } 6972 } 6973 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_lt_8_strided_a)6974 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_lt_8_strided_a) { 6975 TEST_REQUIRES_ARM_NEON_BF16; 6976 for (size_t k = 1; k < 8; k++) { 6977 GemmMicrokernelTester() 6978 .mr(2) 6979 .nr(4) 6980 .kr(8) 6981 .sr(1) 6982 .m(2) 6983 .n(4) 6984 .k(k) 6985 .a_stride(11) 6986 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 6987 } 6988 } 6989 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_lt_8_subtile)6990 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_lt_8_subtile) { 6991 TEST_REQUIRES_ARM_NEON_BF16; 6992 for (size_t k = 1; k < 8; k++) { 6993 for (uint32_t n = 1; n <= 4; n++) { 6994 for (uint32_t m = 1; m <= 2; m++) { 6995 GemmMicrokernelTester() 6996 .mr(2) 6997 .nr(4) 6998 .kr(8) 6999 .sr(1) 7000 .m(m) 7001 .n(n) 7002 .k(k) 7003 .iterations(1) 7004 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7005 } 7006 } 7007 } 7008 } 7009 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_gt_8)7010 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_gt_8) { 7011 TEST_REQUIRES_ARM_NEON_BF16; 7012 for (size_t k = 9; k < 16; k++) { 7013 
GemmMicrokernelTester() 7014 .mr(2) 7015 .nr(4) 7016 .kr(8) 7017 .sr(1) 7018 .m(2) 7019 .n(4) 7020 .k(k) 7021 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7022 } 7023 } 7024 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_gt_8_strided_a)7025 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_gt_8_strided_a) { 7026 TEST_REQUIRES_ARM_NEON_BF16; 7027 for (size_t k = 9; k < 16; k++) { 7028 GemmMicrokernelTester() 7029 .mr(2) 7030 .nr(4) 7031 .kr(8) 7032 .sr(1) 7033 .m(2) 7034 .n(4) 7035 .k(k) 7036 .a_stride(19) 7037 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7038 } 7039 } 7040 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_gt_8_subtile)7041 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_gt_8_subtile) { 7042 TEST_REQUIRES_ARM_NEON_BF16; 7043 for (size_t k = 9; k < 16; k++) { 7044 for (uint32_t n = 1; n <= 4; n++) { 7045 for (uint32_t m = 1; m <= 2; m++) { 7046 GemmMicrokernelTester() 7047 .mr(2) 7048 .nr(4) 7049 .kr(8) 7050 .sr(1) 7051 .m(m) 7052 .n(n) 7053 .k(k) 7054 .iterations(1) 7055 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7056 } 7057 } 7058 } 7059 } 7060 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_div_8)7061 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_div_8) { 7062 TEST_REQUIRES_ARM_NEON_BF16; 7063 for (size_t k = 16; k <= 80; k += 8) { 7064 GemmMicrokernelTester() 7065 .mr(2) 7066 .nr(4) 7067 .kr(8) 7068 .sr(1) 7069 .m(2) 7070 .n(4) 7071 .k(k) 7072 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7073 } 7074 } 7075 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_div_8_strided_a)7076 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_div_8_strided_a) { 7077 TEST_REQUIRES_ARM_NEON_BF16; 7078 for (size_t k = 16; k <= 80; k += 8) { 7079 GemmMicrokernelTester() 7080 .mr(2) 7081 .nr(4) 7082 .kr(8) 7083 .sr(1) 7084 .m(2) 7085 .n(4) 7086 .k(k) 7087 .a_stride(83) 7088 
.Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7089 } 7090 } 7091 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,k_div_8_subtile)7092 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, k_div_8_subtile) { 7093 TEST_REQUIRES_ARM_NEON_BF16; 7094 for (size_t k = 16; k <= 80; k += 8) { 7095 for (uint32_t n = 1; n <= 4; n++) { 7096 for (uint32_t m = 1; m <= 2; m++) { 7097 GemmMicrokernelTester() 7098 .mr(2) 7099 .nr(4) 7100 .kr(8) 7101 .sr(1) 7102 .m(m) 7103 .n(n) 7104 .k(k) 7105 .iterations(1) 7106 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7107 } 7108 } 7109 } 7110 } 7111 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_gt_4)7112 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_gt_4) { 7113 TEST_REQUIRES_ARM_NEON_BF16; 7114 for (uint32_t n = 5; n < 8; n++) { 7115 for (size_t k = 1; k <= 40; k += 9) { 7116 GemmMicrokernelTester() 7117 .mr(2) 7118 .nr(4) 7119 .kr(8) 7120 .sr(1) 7121 .m(2) 7122 .n(n) 7123 .k(k) 7124 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7125 } 7126 } 7127 } 7128 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_gt_4_strided_cn)7129 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_gt_4_strided_cn) { 7130 TEST_REQUIRES_ARM_NEON_BF16; 7131 for (uint32_t n = 5; n < 8; n++) { 7132 for (size_t k = 1; k <= 40; k += 9) { 7133 GemmMicrokernelTester() 7134 .mr(2) 7135 .nr(4) 7136 .kr(8) 7137 .sr(1) 7138 .m(2) 7139 .n(n) 7140 .k(k) 7141 .cn_stride(7) 7142 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7143 } 7144 } 7145 } 7146 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_gt_4_strided_a)7147 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_gt_4_strided_a) { 7148 TEST_REQUIRES_ARM_NEON_BF16; 7149 for (uint32_t n = 5; n < 8; n++) { 7150 for (size_t k = 1; k <= 40; k += 9) { 7151 GemmMicrokernelTester() 7152 .mr(2) 7153 .nr(4) 7154 .kr(8) 7155 .sr(1) 7156 .m(2) 7157 .n(n) 7158 .k(k) 7159 
.a_stride(43) 7160 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7161 } 7162 } 7163 } 7164 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_gt_4_subtile)7165 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_gt_4_subtile) { 7166 TEST_REQUIRES_ARM_NEON_BF16; 7167 for (uint32_t n = 5; n < 8; n++) { 7168 for (size_t k = 1; k <= 40; k += 9) { 7169 for (uint32_t m = 1; m <= 2; m++) { 7170 GemmMicrokernelTester() 7171 .mr(2) 7172 .nr(4) 7173 .kr(8) 7174 .sr(1) 7175 .m(m) 7176 .n(n) 7177 .k(k) 7178 .iterations(1) 7179 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7180 } 7181 } 7182 } 7183 } 7184 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_div_4)7185 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_div_4) { 7186 TEST_REQUIRES_ARM_NEON_BF16; 7187 for (uint32_t n = 8; n <= 12; n += 4) { 7188 for (size_t k = 1; k <= 40; k += 9) { 7189 GemmMicrokernelTester() 7190 .mr(2) 7191 .nr(4) 7192 .kr(8) 7193 .sr(1) 7194 .m(2) 7195 .n(n) 7196 .k(k) 7197 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7198 } 7199 } 7200 } 7201 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_div_4_strided_cn)7202 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_div_4_strided_cn) { 7203 TEST_REQUIRES_ARM_NEON_BF16; 7204 for (uint32_t n = 8; n <= 12; n += 4) { 7205 for (size_t k = 1; k <= 40; k += 9) { 7206 GemmMicrokernelTester() 7207 .mr(2) 7208 .nr(4) 7209 .kr(8) 7210 .sr(1) 7211 .m(2) 7212 .n(n) 7213 .k(k) 7214 .cn_stride(7) 7215 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7216 } 7217 } 7218 } 7219 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_div_4_strided_a)7220 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_div_4_strided_a) { 7221 TEST_REQUIRES_ARM_NEON_BF16; 7222 for (uint32_t n = 8; n <= 12; n += 4) { 7223 for (size_t k = 1; k <= 40; k += 9) { 7224 GemmMicrokernelTester() 7225 .mr(2) 7226 .nr(4) 7227 .kr(8) 7228 
.sr(1) 7229 .m(2) 7230 .n(n) 7231 .k(k) 7232 .a_stride(43) 7233 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7234 } 7235 } 7236 } 7237 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,n_div_4_subtile)7238 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, n_div_4_subtile) { 7239 TEST_REQUIRES_ARM_NEON_BF16; 7240 for (uint32_t n = 8; n <= 12; n += 4) { 7241 for (size_t k = 1; k <= 40; k += 9) { 7242 for (uint32_t m = 1; m <= 2; m++) { 7243 GemmMicrokernelTester() 7244 .mr(2) 7245 .nr(4) 7246 .kr(8) 7247 .sr(1) 7248 .m(m) 7249 .n(n) 7250 .k(k) 7251 .iterations(1) 7252 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7253 } 7254 } 7255 } 7256 } 7257 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,strided_cm_subtile)7258 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, strided_cm_subtile) { 7259 TEST_REQUIRES_ARM_NEON_BF16; 7260 for (size_t k = 1; k <= 40; k += 9) { 7261 for (uint32_t n = 1; n <= 4; n++) { 7262 for (uint32_t m = 1; m <= 2; m++) { 7263 GemmMicrokernelTester() 7264 .mr(2) 7265 .nr(4) 7266 .kr(8) 7267 .sr(1) 7268 .m(m) 7269 .n(n) 7270 .k(k) 7271 .cm_stride(7) 7272 .iterations(1) 7273 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7274 } 7275 } 7276 } 7277 } 7278 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,qmin)7279 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, qmin) { 7280 TEST_REQUIRES_ARM_NEON_BF16; 7281 GemmMicrokernelTester() 7282 .mr(2) 7283 .nr(4) 7284 .kr(8) 7285 .sr(1) 7286 .m(2) 7287 .n(4) 7288 .k(8) 7289 .qmin(128) 7290 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7291 } 7292 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,qmax)7293 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, qmax) { 7294 TEST_REQUIRES_ARM_NEON_BF16; 7295 GemmMicrokernelTester() 7296 .mr(2) 7297 .nr(4) 7298 .kr(8) 7299 .sr(1) 7300 .m(2) 7301 .n(4) 7302 .k(8) 7303 .qmax(128) 7304 
.Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7305 } 7306 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT,strided_cm)7307 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFDOT, strided_cm) { 7308 TEST_REQUIRES_ARM_NEON_BF16; 7309 GemmMicrokernelTester() 7310 .mr(2) 7311 .nr(4) 7312 .kr(8) 7313 .sr(1) 7314 .m(2) 7315 .n(4) 7316 .k(8) 7317 .cm_stride(7) 7318 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7319 } 7320 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7321 7322 7323 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_eq_8)7324 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_eq_8) { 7325 TEST_REQUIRES_ARM_NEON_BF16; 7326 GemmMicrokernelTester() 7327 .mr(3) 7328 .nr(4) 7329 .kr(8) 7330 .sr(1) 7331 .m(3) 7332 .n(4) 7333 .k(8) 7334 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7335 } 7336 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,strided_cn)7337 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, strided_cn) { 7338 TEST_REQUIRES_ARM_NEON_BF16; 7339 GemmMicrokernelTester() 7340 .mr(3) 7341 .nr(4) 7342 .kr(8) 7343 .sr(1) 7344 .m(3) 7345 .n(4) 7346 .k(8) 7347 .cn_stride(7) 7348 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7349 } 7350 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_eq_8_strided_a)7351 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_eq_8_strided_a) { 7352 TEST_REQUIRES_ARM_NEON_BF16; 7353 GemmMicrokernelTester() 7354 .mr(3) 7355 .nr(4) 7356 .kr(8) 7357 .sr(1) 7358 .m(3) 7359 .n(4) 7360 .k(8) 7361 .a_stride(11) 7362 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7363 } 7364 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_eq_8_subtile)7365 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_eq_8_subtile) { 7366 TEST_REQUIRES_ARM_NEON_BF16; 7367 for (uint32_t n = 1; n 
<= 4; n++) { 7368 for (uint32_t m = 1; m <= 3; m++) { 7369 GemmMicrokernelTester() 7370 .mr(3) 7371 .nr(4) 7372 .kr(8) 7373 .sr(1) 7374 .m(m) 7375 .n(n) 7376 .k(8) 7377 .iterations(1) 7378 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7379 } 7380 } 7381 } 7382 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_eq_8_subtile_m)7383 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_eq_8_subtile_m) { 7384 TEST_REQUIRES_ARM_NEON_BF16; 7385 for (uint32_t m = 1; m <= 3; m++) { 7386 GemmMicrokernelTester() 7387 .mr(3) 7388 .nr(4) 7389 .kr(8) 7390 .sr(1) 7391 .m(m) 7392 .n(4) 7393 .k(8) 7394 .iterations(1) 7395 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7396 } 7397 } 7398 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_eq_8_subtile_n)7399 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_eq_8_subtile_n) { 7400 TEST_REQUIRES_ARM_NEON_BF16; 7401 for (uint32_t n = 1; n <= 4; n++) { 7402 GemmMicrokernelTester() 7403 .mr(3) 7404 .nr(4) 7405 .kr(8) 7406 .sr(1) 7407 .m(3) 7408 .n(n) 7409 .k(8) 7410 .iterations(1) 7411 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7412 } 7413 } 7414 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_lt_8)7415 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_lt_8) { 7416 TEST_REQUIRES_ARM_NEON_BF16; 7417 for (size_t k = 1; k < 8; k++) { 7418 GemmMicrokernelTester() 7419 .mr(3) 7420 .nr(4) 7421 .kr(8) 7422 .sr(1) 7423 .m(3) 7424 .n(4) 7425 .k(k) 7426 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7427 } 7428 } 7429 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_lt_8_strided_a)7430 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_lt_8_strided_a) { 7431 TEST_REQUIRES_ARM_NEON_BF16; 7432 for (size_t k = 1; k < 8; k++) { 7433 GemmMicrokernelTester() 7434 .mr(3) 7435 .nr(4) 7436 .kr(8) 7437 .sr(1) 7438 .m(3) 7439 .n(4) 7440 .k(k) 7441 .a_stride(11) 7442 
.Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7443 } 7444 } 7445 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_lt_8_subtile)7446 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_lt_8_subtile) { 7447 TEST_REQUIRES_ARM_NEON_BF16; 7448 for (size_t k = 1; k < 8; k++) { 7449 for (uint32_t n = 1; n <= 4; n++) { 7450 for (uint32_t m = 1; m <= 3; m++) { 7451 GemmMicrokernelTester() 7452 .mr(3) 7453 .nr(4) 7454 .kr(8) 7455 .sr(1) 7456 .m(m) 7457 .n(n) 7458 .k(k) 7459 .iterations(1) 7460 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7461 } 7462 } 7463 } 7464 } 7465 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_gt_8)7466 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_gt_8) { 7467 TEST_REQUIRES_ARM_NEON_BF16; 7468 for (size_t k = 9; k < 16; k++) { 7469 GemmMicrokernelTester() 7470 .mr(3) 7471 .nr(4) 7472 .kr(8) 7473 .sr(1) 7474 .m(3) 7475 .n(4) 7476 .k(k) 7477 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7478 } 7479 } 7480 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_gt_8_strided_a)7481 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_gt_8_strided_a) { 7482 TEST_REQUIRES_ARM_NEON_BF16; 7483 for (size_t k = 9; k < 16; k++) { 7484 GemmMicrokernelTester() 7485 .mr(3) 7486 .nr(4) 7487 .kr(8) 7488 .sr(1) 7489 .m(3) 7490 .n(4) 7491 .k(k) 7492 .a_stride(19) 7493 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7494 } 7495 } 7496 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_gt_8_subtile)7497 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_gt_8_subtile) { 7498 TEST_REQUIRES_ARM_NEON_BF16; 7499 for (size_t k = 9; k < 16; k++) { 7500 for (uint32_t n = 1; n <= 4; n++) { 7501 for (uint32_t m = 1; m <= 3; m++) { 7502 GemmMicrokernelTester() 7503 .mr(3) 7504 .nr(4) 7505 .kr(8) 7506 .sr(1) 7507 .m(m) 7508 .n(n) 7509 .k(k) 7510 .iterations(1) 7511 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 
xnn_init_bf16_minmax_scalar_params); 7512 } 7513 } 7514 } 7515 } 7516 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_div_8)7517 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_div_8) { 7518 TEST_REQUIRES_ARM_NEON_BF16; 7519 for (size_t k = 16; k <= 80; k += 8) { 7520 GemmMicrokernelTester() 7521 .mr(3) 7522 .nr(4) 7523 .kr(8) 7524 .sr(1) 7525 .m(3) 7526 .n(4) 7527 .k(k) 7528 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7529 } 7530 } 7531 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_div_8_strided_a)7532 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_div_8_strided_a) { 7533 TEST_REQUIRES_ARM_NEON_BF16; 7534 for (size_t k = 16; k <= 80; k += 8) { 7535 GemmMicrokernelTester() 7536 .mr(3) 7537 .nr(4) 7538 .kr(8) 7539 .sr(1) 7540 .m(3) 7541 .n(4) 7542 .k(k) 7543 .a_stride(83) 7544 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7545 } 7546 } 7547 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,k_div_8_subtile)7548 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, k_div_8_subtile) { 7549 TEST_REQUIRES_ARM_NEON_BF16; 7550 for (size_t k = 16; k <= 80; k += 8) { 7551 for (uint32_t n = 1; n <= 4; n++) { 7552 for (uint32_t m = 1; m <= 3; m++) { 7553 GemmMicrokernelTester() 7554 .mr(3) 7555 .nr(4) 7556 .kr(8) 7557 .sr(1) 7558 .m(m) 7559 .n(n) 7560 .k(k) 7561 .iterations(1) 7562 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7563 } 7564 } 7565 } 7566 } 7567 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_gt_4)7568 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_gt_4) { 7569 TEST_REQUIRES_ARM_NEON_BF16; 7570 for (uint32_t n = 5; n < 8; n++) { 7571 for (size_t k = 1; k <= 40; k += 9) { 7572 GemmMicrokernelTester() 7573 .mr(3) 7574 .nr(4) 7575 .kr(8) 7576 .sr(1) 7577 .m(3) 7578 .n(n) 7579 .k(k) 7580 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7581 } 7582 } 7583 } 7584 
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_gt_4_strided_cn)7585 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_gt_4_strided_cn) { 7586 TEST_REQUIRES_ARM_NEON_BF16; 7587 for (uint32_t n = 5; n < 8; n++) { 7588 for (size_t k = 1; k <= 40; k += 9) { 7589 GemmMicrokernelTester() 7590 .mr(3) 7591 .nr(4) 7592 .kr(8) 7593 .sr(1) 7594 .m(3) 7595 .n(n) 7596 .k(k) 7597 .cn_stride(7) 7598 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7599 } 7600 } 7601 } 7602 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_gt_4_strided_a)7603 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_gt_4_strided_a) { 7604 TEST_REQUIRES_ARM_NEON_BF16; 7605 for (uint32_t n = 5; n < 8; n++) { 7606 for (size_t k = 1; k <= 40; k += 9) { 7607 GemmMicrokernelTester() 7608 .mr(3) 7609 .nr(4) 7610 .kr(8) 7611 .sr(1) 7612 .m(3) 7613 .n(n) 7614 .k(k) 7615 .a_stride(43) 7616 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7617 } 7618 } 7619 } 7620 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_gt_4_subtile)7621 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_gt_4_subtile) { 7622 TEST_REQUIRES_ARM_NEON_BF16; 7623 for (uint32_t n = 5; n < 8; n++) { 7624 for (size_t k = 1; k <= 40; k += 9) { 7625 for (uint32_t m = 1; m <= 3; m++) { 7626 GemmMicrokernelTester() 7627 .mr(3) 7628 .nr(4) 7629 .kr(8) 7630 .sr(1) 7631 .m(m) 7632 .n(n) 7633 .k(k) 7634 .iterations(1) 7635 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7636 } 7637 } 7638 } 7639 } 7640 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_div_4)7641 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_div_4) { 7642 TEST_REQUIRES_ARM_NEON_BF16; 7643 for (uint32_t n = 8; n <= 12; n += 4) { 7644 for (size_t k = 1; k <= 40; k += 9) { 7645 GemmMicrokernelTester() 7646 .mr(3) 7647 .nr(4) 7648 .kr(8) 7649 .sr(1) 7650 .m(3) 7651 .n(n) 7652 .k(k) 7653 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, 
xnn_init_bf16_minmax_scalar_params); 7654 } 7655 } 7656 } 7657 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_div_4_strided_cn)7658 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_div_4_strided_cn) { 7659 TEST_REQUIRES_ARM_NEON_BF16; 7660 for (uint32_t n = 8; n <= 12; n += 4) { 7661 for (size_t k = 1; k <= 40; k += 9) { 7662 GemmMicrokernelTester() 7663 .mr(3) 7664 .nr(4) 7665 .kr(8) 7666 .sr(1) 7667 .m(3) 7668 .n(n) 7669 .k(k) 7670 .cn_stride(7) 7671 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7672 } 7673 } 7674 } 7675 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_div_4_strided_a)7676 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_div_4_strided_a) { 7677 TEST_REQUIRES_ARM_NEON_BF16; 7678 for (uint32_t n = 8; n <= 12; n += 4) { 7679 for (size_t k = 1; k <= 40; k += 9) { 7680 GemmMicrokernelTester() 7681 .mr(3) 7682 .nr(4) 7683 .kr(8) 7684 .sr(1) 7685 .m(3) 7686 .n(n) 7687 .k(k) 7688 .a_stride(43) 7689 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7690 } 7691 } 7692 } 7693 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,n_div_4_subtile)7694 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, n_div_4_subtile) { 7695 TEST_REQUIRES_ARM_NEON_BF16; 7696 for (uint32_t n = 8; n <= 12; n += 4) { 7697 for (size_t k = 1; k <= 40; k += 9) { 7698 for (uint32_t m = 1; m <= 3; m++) { 7699 GemmMicrokernelTester() 7700 .mr(3) 7701 .nr(4) 7702 .kr(8) 7703 .sr(1) 7704 .m(m) 7705 .n(n) 7706 .k(k) 7707 .iterations(1) 7708 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7709 } 7710 } 7711 } 7712 } 7713 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,strided_cm_subtile)7714 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, strided_cm_subtile) { 7715 TEST_REQUIRES_ARM_NEON_BF16; 7716 for (size_t k = 1; k <= 40; k += 9) { 7717 for (uint32_t n = 1; n <= 4; n++) { 7718 for (uint32_t m = 1; m <= 3; m++) { 7719 GemmMicrokernelTester() 7720 .mr(3) 7721 .nr(4) 7722 .kr(8) 
7723 .sr(1) 7724 .m(m) 7725 .n(n) 7726 .k(k) 7727 .cm_stride(7) 7728 .iterations(1) 7729 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7730 } 7731 } 7732 } 7733 } 7734 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,qmin)7735 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, qmin) { 7736 TEST_REQUIRES_ARM_NEON_BF16; 7737 GemmMicrokernelTester() 7738 .mr(3) 7739 .nr(4) 7740 .kr(8) 7741 .sr(1) 7742 .m(3) 7743 .n(4) 7744 .k(8) 7745 .qmin(128) 7746 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7747 } 7748 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,qmax)7749 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, qmax) { 7750 TEST_REQUIRES_ARM_NEON_BF16; 7751 GemmMicrokernelTester() 7752 .mr(3) 7753 .nr(4) 7754 .kr(8) 7755 .sr(1) 7756 .m(3) 7757 .n(4) 7758 .k(8) 7759 .qmax(128) 7760 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7761 } 7762 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT,strided_cm)7763 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFDOT, strided_cm) { 7764 TEST_REQUIRES_ARM_NEON_BF16; 7765 GemmMicrokernelTester() 7766 .mr(3) 7767 .nr(4) 7768 .kr(8) 7769 .sr(1) 7770 .m(3) 7771 .n(4) 7772 .k(8) 7773 .cm_stride(7) 7774 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7775 } 7776 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7777 7778 7779 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_eq_8)7780 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_eq_8) { 7781 TEST_REQUIRES_ARM_NEON_BF16; 7782 GemmMicrokernelTester() 7783 .mr(4) 7784 .nr(4) 7785 .kr(8) 7786 .sr(1) 7787 .m(4) 7788 .n(4) 7789 .k(8) 7790 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7791 } 7792 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,strided_cn)7793 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, strided_cn) { 
7794 TEST_REQUIRES_ARM_NEON_BF16; 7795 GemmMicrokernelTester() 7796 .mr(4) 7797 .nr(4) 7798 .kr(8) 7799 .sr(1) 7800 .m(4) 7801 .n(4) 7802 .k(8) 7803 .cn_stride(7) 7804 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7805 } 7806 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_eq_8_strided_a)7807 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_eq_8_strided_a) { 7808 TEST_REQUIRES_ARM_NEON_BF16; 7809 GemmMicrokernelTester() 7810 .mr(4) 7811 .nr(4) 7812 .kr(8) 7813 .sr(1) 7814 .m(4) 7815 .n(4) 7816 .k(8) 7817 .a_stride(11) 7818 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7819 } 7820 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_eq_8_subtile)7821 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_eq_8_subtile) { 7822 TEST_REQUIRES_ARM_NEON_BF16; 7823 for (uint32_t n = 1; n <= 4; n++) { 7824 for (uint32_t m = 1; m <= 4; m++) { 7825 GemmMicrokernelTester() 7826 .mr(4) 7827 .nr(4) 7828 .kr(8) 7829 .sr(1) 7830 .m(m) 7831 .n(n) 7832 .k(8) 7833 .iterations(1) 7834 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7835 } 7836 } 7837 } 7838 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_eq_8_subtile_m)7839 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_eq_8_subtile_m) { 7840 TEST_REQUIRES_ARM_NEON_BF16; 7841 for (uint32_t m = 1; m <= 4; m++) { 7842 GemmMicrokernelTester() 7843 .mr(4) 7844 .nr(4) 7845 .kr(8) 7846 .sr(1) 7847 .m(m) 7848 .n(4) 7849 .k(8) 7850 .iterations(1) 7851 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7852 } 7853 } 7854 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_eq_8_subtile_n)7855 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_eq_8_subtile_n) { 7856 TEST_REQUIRES_ARM_NEON_BF16; 7857 for (uint32_t n = 1; n <= 4; n++) { 7858 GemmMicrokernelTester() 7859 .mr(4) 7860 .nr(4) 7861 .kr(8) 7862 .sr(1) 7863 .m(4) 7864 .n(n) 7865 .k(8) 7866 .iterations(1) 7867 
.Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7868 } 7869 } 7870 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_lt_8)7871 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_lt_8) { 7872 TEST_REQUIRES_ARM_NEON_BF16; 7873 for (size_t k = 1; k < 8; k++) { 7874 GemmMicrokernelTester() 7875 .mr(4) 7876 .nr(4) 7877 .kr(8) 7878 .sr(1) 7879 .m(4) 7880 .n(4) 7881 .k(k) 7882 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7883 } 7884 } 7885 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_lt_8_strided_a)7886 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_lt_8_strided_a) { 7887 TEST_REQUIRES_ARM_NEON_BF16; 7888 for (size_t k = 1; k < 8; k++) { 7889 GemmMicrokernelTester() 7890 .mr(4) 7891 .nr(4) 7892 .kr(8) 7893 .sr(1) 7894 .m(4) 7895 .n(4) 7896 .k(k) 7897 .a_stride(11) 7898 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7899 } 7900 } 7901 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_lt_8_subtile)7902 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_lt_8_subtile) { 7903 TEST_REQUIRES_ARM_NEON_BF16; 7904 for (size_t k = 1; k < 8; k++) { 7905 for (uint32_t n = 1; n <= 4; n++) { 7906 for (uint32_t m = 1; m <= 4; m++) { 7907 GemmMicrokernelTester() 7908 .mr(4) 7909 .nr(4) 7910 .kr(8) 7911 .sr(1) 7912 .m(m) 7913 .n(n) 7914 .k(k) 7915 .iterations(1) 7916 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7917 } 7918 } 7919 } 7920 } 7921 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_gt_8)7922 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_gt_8) { 7923 TEST_REQUIRES_ARM_NEON_BF16; 7924 for (size_t k = 9; k < 16; k++) { 7925 GemmMicrokernelTester() 7926 .mr(4) 7927 .nr(4) 7928 .kr(8) 7929 .sr(1) 7930 .m(4) 7931 .n(4) 7932 .k(k) 7933 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7934 } 7935 } 7936 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_gt_8_strided_a)7937 
TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_gt_8_strided_a) { 7938 TEST_REQUIRES_ARM_NEON_BF16; 7939 for (size_t k = 9; k < 16; k++) { 7940 GemmMicrokernelTester() 7941 .mr(4) 7942 .nr(4) 7943 .kr(8) 7944 .sr(1) 7945 .m(4) 7946 .n(4) 7947 .k(k) 7948 .a_stride(19) 7949 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7950 } 7951 } 7952 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_gt_8_subtile)7953 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_gt_8_subtile) { 7954 TEST_REQUIRES_ARM_NEON_BF16; 7955 for (size_t k = 9; k < 16; k++) { 7956 for (uint32_t n = 1; n <= 4; n++) { 7957 for (uint32_t m = 1; m <= 4; m++) { 7958 GemmMicrokernelTester() 7959 .mr(4) 7960 .nr(4) 7961 .kr(8) 7962 .sr(1) 7963 .m(m) 7964 .n(n) 7965 .k(k) 7966 .iterations(1) 7967 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7968 } 7969 } 7970 } 7971 } 7972 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_div_8)7973 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_div_8) { 7974 TEST_REQUIRES_ARM_NEON_BF16; 7975 for (size_t k = 16; k <= 80; k += 8) { 7976 GemmMicrokernelTester() 7977 .mr(4) 7978 .nr(4) 7979 .kr(8) 7980 .sr(1) 7981 .m(4) 7982 .n(4) 7983 .k(k) 7984 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 7985 } 7986 } 7987 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_div_8_strided_a)7988 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_div_8_strided_a) { 7989 TEST_REQUIRES_ARM_NEON_BF16; 7990 for (size_t k = 16; k <= 80; k += 8) { 7991 GemmMicrokernelTester() 7992 .mr(4) 7993 .nr(4) 7994 .kr(8) 7995 .sr(1) 7996 .m(4) 7997 .n(4) 7998 .k(k) 7999 .a_stride(83) 8000 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8001 } 8002 } 8003 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,k_div_8_subtile)8004 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, k_div_8_subtile) { 8005 TEST_REQUIRES_ARM_NEON_BF16; 8006 for (size_t k = 16; k 
<= 80; k += 8) { 8007 for (uint32_t n = 1; n <= 4; n++) { 8008 for (uint32_t m = 1; m <= 4; m++) { 8009 GemmMicrokernelTester() 8010 .mr(4) 8011 .nr(4) 8012 .kr(8) 8013 .sr(1) 8014 .m(m) 8015 .n(n) 8016 .k(k) 8017 .iterations(1) 8018 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8019 } 8020 } 8021 } 8022 } 8023 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_gt_4)8024 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_gt_4) { 8025 TEST_REQUIRES_ARM_NEON_BF16; 8026 for (uint32_t n = 5; n < 8; n++) { 8027 for (size_t k = 1; k <= 40; k += 9) { 8028 GemmMicrokernelTester() 8029 .mr(4) 8030 .nr(4) 8031 .kr(8) 8032 .sr(1) 8033 .m(4) 8034 .n(n) 8035 .k(k) 8036 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8037 } 8038 } 8039 } 8040 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_gt_4_strided_cn)8041 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_gt_4_strided_cn) { 8042 TEST_REQUIRES_ARM_NEON_BF16; 8043 for (uint32_t n = 5; n < 8; n++) { 8044 for (size_t k = 1; k <= 40; k += 9) { 8045 GemmMicrokernelTester() 8046 .mr(4) 8047 .nr(4) 8048 .kr(8) 8049 .sr(1) 8050 .m(4) 8051 .n(n) 8052 .k(k) 8053 .cn_stride(7) 8054 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8055 } 8056 } 8057 } 8058 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_gt_4_strided_a)8059 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_gt_4_strided_a) { 8060 TEST_REQUIRES_ARM_NEON_BF16; 8061 for (uint32_t n = 5; n < 8; n++) { 8062 for (size_t k = 1; k <= 40; k += 9) { 8063 GemmMicrokernelTester() 8064 .mr(4) 8065 .nr(4) 8066 .kr(8) 8067 .sr(1) 8068 .m(4) 8069 .n(n) 8070 .k(k) 8071 .a_stride(43) 8072 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8073 } 8074 } 8075 } 8076 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_gt_4_subtile)8077 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_gt_4_subtile) { 8078 TEST_REQUIRES_ARM_NEON_BF16; 8079 
for (uint32_t n = 5; n < 8; n++) { 8080 for (size_t k = 1; k <= 40; k += 9) { 8081 for (uint32_t m = 1; m <= 4; m++) { 8082 GemmMicrokernelTester() 8083 .mr(4) 8084 .nr(4) 8085 .kr(8) 8086 .sr(1) 8087 .m(m) 8088 .n(n) 8089 .k(k) 8090 .iterations(1) 8091 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8092 } 8093 } 8094 } 8095 } 8096 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_div_4)8097 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_div_4) { 8098 TEST_REQUIRES_ARM_NEON_BF16; 8099 for (uint32_t n = 8; n <= 12; n += 4) { 8100 for (size_t k = 1; k <= 40; k += 9) { 8101 GemmMicrokernelTester() 8102 .mr(4) 8103 .nr(4) 8104 .kr(8) 8105 .sr(1) 8106 .m(4) 8107 .n(n) 8108 .k(k) 8109 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8110 } 8111 } 8112 } 8113 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_div_4_strided_cn)8114 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_div_4_strided_cn) { 8115 TEST_REQUIRES_ARM_NEON_BF16; 8116 for (uint32_t n = 8; n <= 12; n += 4) { 8117 for (size_t k = 1; k <= 40; k += 9) { 8118 GemmMicrokernelTester() 8119 .mr(4) 8120 .nr(4) 8121 .kr(8) 8122 .sr(1) 8123 .m(4) 8124 .n(n) 8125 .k(k) 8126 .cn_stride(7) 8127 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8128 } 8129 } 8130 } 8131 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_div_4_strided_a)8132 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, n_div_4_strided_a) { 8133 TEST_REQUIRES_ARM_NEON_BF16; 8134 for (uint32_t n = 8; n <= 12; n += 4) { 8135 for (size_t k = 1; k <= 40; k += 9) { 8136 GemmMicrokernelTester() 8137 .mr(4) 8138 .nr(4) 8139 .kr(8) 8140 .sr(1) 8141 .m(4) 8142 .n(n) 8143 .k(k) 8144 .a_stride(43) 8145 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8146 } 8147 } 8148 } 8149 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,n_div_4_subtile)8150 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, 
n_div_4_subtile) { 8151 TEST_REQUIRES_ARM_NEON_BF16; 8152 for (uint32_t n = 8; n <= 12; n += 4) { 8153 for (size_t k = 1; k <= 40; k += 9) { 8154 for (uint32_t m = 1; m <= 4; m++) { 8155 GemmMicrokernelTester() 8156 .mr(4) 8157 .nr(4) 8158 .kr(8) 8159 .sr(1) 8160 .m(m) 8161 .n(n) 8162 .k(k) 8163 .iterations(1) 8164 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8165 } 8166 } 8167 } 8168 } 8169 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,strided_cm_subtile)8170 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, strided_cm_subtile) { 8171 TEST_REQUIRES_ARM_NEON_BF16; 8172 for (size_t k = 1; k <= 40; k += 9) { 8173 for (uint32_t n = 1; n <= 4; n++) { 8174 for (uint32_t m = 1; m <= 4; m++) { 8175 GemmMicrokernelTester() 8176 .mr(4) 8177 .nr(4) 8178 .kr(8) 8179 .sr(1) 8180 .m(m) 8181 .n(n) 8182 .k(k) 8183 .cm_stride(7) 8184 .iterations(1) 8185 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8186 } 8187 } 8188 } 8189 } 8190 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,qmin)8191 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, qmin) { 8192 TEST_REQUIRES_ARM_NEON_BF16; 8193 GemmMicrokernelTester() 8194 .mr(4) 8195 .nr(4) 8196 .kr(8) 8197 .sr(1) 8198 .m(4) 8199 .n(4) 8200 .k(8) 8201 .qmin(128) 8202 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8203 } 8204 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,qmax)8205 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, qmax) { 8206 TEST_REQUIRES_ARM_NEON_BF16; 8207 GemmMicrokernelTester() 8208 .mr(4) 8209 .nr(4) 8210 .kr(8) 8211 .sr(1) 8212 .m(4) 8213 .n(4) 8214 .k(8) 8215 .qmax(128) 8216 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8217 } 8218 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT,strided_cm)8219 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFDOT, strided_cm) { 8220 TEST_REQUIRES_ARM_NEON_BF16; 8221 GemmMicrokernelTester() 8222 .mr(4) 8223 .nr(4) 8224 .kr(8) 
8225 .sr(1) 8226 .m(4) 8227 .n(4) 8228 .k(8) 8229 .cm_stride(7) 8230 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8231 } 8232 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 8233 8234 8235 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_eq_8)8236 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_eq_8) { 8237 TEST_REQUIRES_ARM_NEON_BF16; 8238 GemmMicrokernelTester() 8239 .mr(5) 8240 .nr(4) 8241 .kr(8) 8242 .sr(1) 8243 .m(5) 8244 .n(4) 8245 .k(8) 8246 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8247 } 8248 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,strided_cn)8249 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, strided_cn) { 8250 TEST_REQUIRES_ARM_NEON_BF16; 8251 GemmMicrokernelTester() 8252 .mr(5) 8253 .nr(4) 8254 .kr(8) 8255 .sr(1) 8256 .m(5) 8257 .n(4) 8258 .k(8) 8259 .cn_stride(7) 8260 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8261 } 8262 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_eq_8_strided_a)8263 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_eq_8_strided_a) { 8264 TEST_REQUIRES_ARM_NEON_BF16; 8265 GemmMicrokernelTester() 8266 .mr(5) 8267 .nr(4) 8268 .kr(8) 8269 .sr(1) 8270 .m(5) 8271 .n(4) 8272 .k(8) 8273 .a_stride(11) 8274 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8275 } 8276 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_eq_8_subtile)8277 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_eq_8_subtile) { 8278 TEST_REQUIRES_ARM_NEON_BF16; 8279 for (uint32_t n = 1; n <= 4; n++) { 8280 for (uint32_t m = 1; m <= 5; m++) { 8281 GemmMicrokernelTester() 8282 .mr(5) 8283 .nr(4) 8284 .kr(8) 8285 .sr(1) 8286 .m(m) 8287 .n(n) 8288 .k(8) 8289 .iterations(1) 8290 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8291 } 8292 } 8293 } 8294 
TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_eq_8_subtile_m)8295 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_eq_8_subtile_m) { 8296 TEST_REQUIRES_ARM_NEON_BF16; 8297 for (uint32_t m = 1; m <= 5; m++) { 8298 GemmMicrokernelTester() 8299 .mr(5) 8300 .nr(4) 8301 .kr(8) 8302 .sr(1) 8303 .m(m) 8304 .n(4) 8305 .k(8) 8306 .iterations(1) 8307 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8308 } 8309 } 8310 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_eq_8_subtile_n)8311 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_eq_8_subtile_n) { 8312 TEST_REQUIRES_ARM_NEON_BF16; 8313 for (uint32_t n = 1; n <= 4; n++) { 8314 GemmMicrokernelTester() 8315 .mr(5) 8316 .nr(4) 8317 .kr(8) 8318 .sr(1) 8319 .m(5) 8320 .n(n) 8321 .k(8) 8322 .iterations(1) 8323 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8324 } 8325 } 8326 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_lt_8)8327 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_lt_8) { 8328 TEST_REQUIRES_ARM_NEON_BF16; 8329 for (size_t k = 1; k < 8; k++) { 8330 GemmMicrokernelTester() 8331 .mr(5) 8332 .nr(4) 8333 .kr(8) 8334 .sr(1) 8335 .m(5) 8336 .n(4) 8337 .k(k) 8338 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8339 } 8340 } 8341 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_lt_8_strided_a)8342 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_lt_8_strided_a) { 8343 TEST_REQUIRES_ARM_NEON_BF16; 8344 for (size_t k = 1; k < 8; k++) { 8345 GemmMicrokernelTester() 8346 .mr(5) 8347 .nr(4) 8348 .kr(8) 8349 .sr(1) 8350 .m(5) 8351 .n(4) 8352 .k(k) 8353 .a_stride(11) 8354 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8355 } 8356 } 8357 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_lt_8_subtile)8358 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_lt_8_subtile) { 8359 TEST_REQUIRES_ARM_NEON_BF16; 8360 for (size_t k = 1; k < 8; k++) { 8361 for (uint32_t n = 1; n 
<= 4; n++) { 8362 for (uint32_t m = 1; m <= 5; m++) { 8363 GemmMicrokernelTester() 8364 .mr(5) 8365 .nr(4) 8366 .kr(8) 8367 .sr(1) 8368 .m(m) 8369 .n(n) 8370 .k(k) 8371 .iterations(1) 8372 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8373 } 8374 } 8375 } 8376 } 8377 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_gt_8)8378 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_gt_8) { 8379 TEST_REQUIRES_ARM_NEON_BF16; 8380 for (size_t k = 9; k < 16; k++) { 8381 GemmMicrokernelTester() 8382 .mr(5) 8383 .nr(4) 8384 .kr(8) 8385 .sr(1) 8386 .m(5) 8387 .n(4) 8388 .k(k) 8389 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8390 } 8391 } 8392 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_gt_8_strided_a)8393 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_gt_8_strided_a) { 8394 TEST_REQUIRES_ARM_NEON_BF16; 8395 for (size_t k = 9; k < 16; k++) { 8396 GemmMicrokernelTester() 8397 .mr(5) 8398 .nr(4) 8399 .kr(8) 8400 .sr(1) 8401 .m(5) 8402 .n(4) 8403 .k(k) 8404 .a_stride(19) 8405 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8406 } 8407 } 8408 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_gt_8_subtile)8409 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_gt_8_subtile) { 8410 TEST_REQUIRES_ARM_NEON_BF16; 8411 for (size_t k = 9; k < 16; k++) { 8412 for (uint32_t n = 1; n <= 4; n++) { 8413 for (uint32_t m = 1; m <= 5; m++) { 8414 GemmMicrokernelTester() 8415 .mr(5) 8416 .nr(4) 8417 .kr(8) 8418 .sr(1) 8419 .m(m) 8420 .n(n) 8421 .k(k) 8422 .iterations(1) 8423 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8424 } 8425 } 8426 } 8427 } 8428 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_div_8)8429 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_div_8) { 8430 TEST_REQUIRES_ARM_NEON_BF16; 8431 for (size_t k = 16; k <= 80; k += 8) { 8432 GemmMicrokernelTester() 8433 .mr(5) 8434 .nr(4) 8435 .kr(8) 8436 .sr(1) 8437 
.m(5) 8438 .n(4) 8439 .k(k) 8440 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8441 } 8442 } 8443 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_div_8_strided_a)8444 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_div_8_strided_a) { 8445 TEST_REQUIRES_ARM_NEON_BF16; 8446 for (size_t k = 16; k <= 80; k += 8) { 8447 GemmMicrokernelTester() 8448 .mr(5) 8449 .nr(4) 8450 .kr(8) 8451 .sr(1) 8452 .m(5) 8453 .n(4) 8454 .k(k) 8455 .a_stride(83) 8456 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8457 } 8458 } 8459 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,k_div_8_subtile)8460 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, k_div_8_subtile) { 8461 TEST_REQUIRES_ARM_NEON_BF16; 8462 for (size_t k = 16; k <= 80; k += 8) { 8463 for (uint32_t n = 1; n <= 4; n++) { 8464 for (uint32_t m = 1; m <= 5; m++) { 8465 GemmMicrokernelTester() 8466 .mr(5) 8467 .nr(4) 8468 .kr(8) 8469 .sr(1) 8470 .m(m) 8471 .n(n) 8472 .k(k) 8473 .iterations(1) 8474 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8475 } 8476 } 8477 } 8478 } 8479 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_gt_4)8480 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_gt_4) { 8481 TEST_REQUIRES_ARM_NEON_BF16; 8482 for (uint32_t n = 5; n < 8; n++) { 8483 for (size_t k = 1; k <= 40; k += 9) { 8484 GemmMicrokernelTester() 8485 .mr(5) 8486 .nr(4) 8487 .kr(8) 8488 .sr(1) 8489 .m(5) 8490 .n(n) 8491 .k(k) 8492 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8493 } 8494 } 8495 } 8496 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_gt_4_strided_cn)8497 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_gt_4_strided_cn) { 8498 TEST_REQUIRES_ARM_NEON_BF16; 8499 for (uint32_t n = 5; n < 8; n++) { 8500 for (size_t k = 1; k <= 40; k += 9) { 8501 GemmMicrokernelTester() 8502 .mr(5) 8503 .nr(4) 8504 .kr(8) 8505 .sr(1) 8506 .m(5) 8507 .n(n) 8508 .k(k) 8509 
.cn_stride(7) 8510 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8511 } 8512 } 8513 } 8514 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_gt_4_strided_a)8515 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_gt_4_strided_a) { 8516 TEST_REQUIRES_ARM_NEON_BF16; 8517 for (uint32_t n = 5; n < 8; n++) { 8518 for (size_t k = 1; k <= 40; k += 9) { 8519 GemmMicrokernelTester() 8520 .mr(5) 8521 .nr(4) 8522 .kr(8) 8523 .sr(1) 8524 .m(5) 8525 .n(n) 8526 .k(k) 8527 .a_stride(43) 8528 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8529 } 8530 } 8531 } 8532 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_gt_4_subtile)8533 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_gt_4_subtile) { 8534 TEST_REQUIRES_ARM_NEON_BF16; 8535 for (uint32_t n = 5; n < 8; n++) { 8536 for (size_t k = 1; k <= 40; k += 9) { 8537 for (uint32_t m = 1; m <= 5; m++) { 8538 GemmMicrokernelTester() 8539 .mr(5) 8540 .nr(4) 8541 .kr(8) 8542 .sr(1) 8543 .m(m) 8544 .n(n) 8545 .k(k) 8546 .iterations(1) 8547 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8548 } 8549 } 8550 } 8551 } 8552 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_div_4)8553 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_div_4) { 8554 TEST_REQUIRES_ARM_NEON_BF16; 8555 for (uint32_t n = 8; n <= 12; n += 4) { 8556 for (size_t k = 1; k <= 40; k += 9) { 8557 GemmMicrokernelTester() 8558 .mr(5) 8559 .nr(4) 8560 .kr(8) 8561 .sr(1) 8562 .m(5) 8563 .n(n) 8564 .k(k) 8565 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8566 } 8567 } 8568 } 8569 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_div_4_strided_cn)8570 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_div_4_strided_cn) { 8571 TEST_REQUIRES_ARM_NEON_BF16; 8572 for (uint32_t n = 8; n <= 12; n += 4) { 8573 for (size_t k = 1; k <= 40; k += 9) { 8574 GemmMicrokernelTester() 8575 .mr(5) 8576 .nr(4) 8577 .kr(8) 8578 .sr(1) 
8579 .m(5) 8580 .n(n) 8581 .k(k) 8582 .cn_stride(7) 8583 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8584 } 8585 } 8586 } 8587 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_div_4_strided_a)8588 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_div_4_strided_a) { 8589 TEST_REQUIRES_ARM_NEON_BF16; 8590 for (uint32_t n = 8; n <= 12; n += 4) { 8591 for (size_t k = 1; k <= 40; k += 9) { 8592 GemmMicrokernelTester() 8593 .mr(5) 8594 .nr(4) 8595 .kr(8) 8596 .sr(1) 8597 .m(5) 8598 .n(n) 8599 .k(k) 8600 .a_stride(43) 8601 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8602 } 8603 } 8604 } 8605 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,n_div_4_subtile)8606 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, n_div_4_subtile) { 8607 TEST_REQUIRES_ARM_NEON_BF16; 8608 for (uint32_t n = 8; n <= 12; n += 4) { 8609 for (size_t k = 1; k <= 40; k += 9) { 8610 for (uint32_t m = 1; m <= 5; m++) { 8611 GemmMicrokernelTester() 8612 .mr(5) 8613 .nr(4) 8614 .kr(8) 8615 .sr(1) 8616 .m(m) 8617 .n(n) 8618 .k(k) 8619 .iterations(1) 8620 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8621 } 8622 } 8623 } 8624 } 8625 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,strided_cm_subtile)8626 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, strided_cm_subtile) { 8627 TEST_REQUIRES_ARM_NEON_BF16; 8628 for (size_t k = 1; k <= 40; k += 9) { 8629 for (uint32_t n = 1; n <= 4; n++) { 8630 for (uint32_t m = 1; m <= 5; m++) { 8631 GemmMicrokernelTester() 8632 .mr(5) 8633 .nr(4) 8634 .kr(8) 8635 .sr(1) 8636 .m(m) 8637 .n(n) 8638 .k(k) 8639 .cm_stride(7) 8640 .iterations(1) 8641 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8642 } 8643 } 8644 } 8645 } 8646 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,qmin)8647 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, qmin) { 8648 TEST_REQUIRES_ARM_NEON_BF16; 8649 GemmMicrokernelTester() 8650 
.mr(5) 8651 .nr(4) 8652 .kr(8) 8653 .sr(1) 8654 .m(5) 8655 .n(4) 8656 .k(8) 8657 .qmin(128) 8658 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8659 } 8660 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,qmax)8661 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, qmax) { 8662 TEST_REQUIRES_ARM_NEON_BF16; 8663 GemmMicrokernelTester() 8664 .mr(5) 8665 .nr(4) 8666 .kr(8) 8667 .sr(1) 8668 .m(5) 8669 .n(4) 8670 .k(8) 8671 .qmax(128) 8672 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8673 } 8674 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT,strided_cm)8675 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFDOT, strided_cm) { 8676 TEST_REQUIRES_ARM_NEON_BF16; 8677 GemmMicrokernelTester() 8678 .mr(5) 8679 .nr(4) 8680 .kr(8) 8681 .sr(1) 8682 .m(5) 8683 .n(4) 8684 .k(8) 8685 .cm_stride(7) 8686 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfdot, xnn_init_bf16_minmax_scalar_params); 8687 } 8688 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 8689 8690 8691 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_eq_8)8692 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_eq_8) { 8693 TEST_REQUIRES_ARM_NEON_BF16; 8694 GemmMicrokernelTester() 8695 .mr(1) 8696 .nr(4) 8697 .kr(8) 8698 .sr(1) 8699 .m(1) 8700 .n(4) 8701 .k(8) 8702 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8703 } 8704 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,strided_cn)8705 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, strided_cn) { 8706 TEST_REQUIRES_ARM_NEON_BF16; 8707 GemmMicrokernelTester() 8708 .mr(1) 8709 .nr(4) 8710 .kr(8) 8711 .sr(1) 8712 .m(1) 8713 .n(4) 8714 .k(8) 8715 .cn_stride(7) 8716 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8717 } 8718 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_eq_8_strided_a)8719 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, 
k_eq_8_strided_a) { 8720 TEST_REQUIRES_ARM_NEON_BF16; 8721 GemmMicrokernelTester() 8722 .mr(1) 8723 .nr(4) 8724 .kr(8) 8725 .sr(1) 8726 .m(1) 8727 .n(4) 8728 .k(8) 8729 .a_stride(11) 8730 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8731 } 8732 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_eq_8_subtile)8733 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_eq_8_subtile) { 8734 TEST_REQUIRES_ARM_NEON_BF16; 8735 for (uint32_t n = 1; n <= 4; n++) { 8736 for (uint32_t m = 1; m <= 1; m++) { 8737 GemmMicrokernelTester() 8738 .mr(1) 8739 .nr(4) 8740 .kr(8) 8741 .sr(1) 8742 .m(m) 8743 .n(n) 8744 .k(8) 8745 .iterations(1) 8746 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8747 } 8748 } 8749 } 8750 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_m)8751 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_m) { 8752 TEST_REQUIRES_ARM_NEON_BF16; 8753 for (uint32_t m = 1; m <= 1; m++) { 8754 GemmMicrokernelTester() 8755 .mr(1) 8756 .nr(4) 8757 .kr(8) 8758 .sr(1) 8759 .m(m) 8760 .n(4) 8761 .k(8) 8762 .iterations(1) 8763 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8764 } 8765 } 8766 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_n)8767 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_n) { 8768 TEST_REQUIRES_ARM_NEON_BF16; 8769 for (uint32_t n = 1; n <= 4; n++) { 8770 GemmMicrokernelTester() 8771 .mr(1) 8772 .nr(4) 8773 .kr(8) 8774 .sr(1) 8775 .m(1) 8776 .n(n) 8777 .k(8) 8778 .iterations(1) 8779 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8780 } 8781 } 8782 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_lt_8)8783 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_lt_8) { 8784 TEST_REQUIRES_ARM_NEON_BF16; 8785 for (size_t k = 1; k < 8; k++) { 8786 GemmMicrokernelTester() 8787 .mr(1) 8788 .nr(4) 8789 .kr(8) 8790 .sr(1) 8791 .m(1) 8792 .n(4) 
8793 .k(k) 8794 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8795 } 8796 } 8797 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_lt_8_strided_a)8798 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_lt_8_strided_a) { 8799 TEST_REQUIRES_ARM_NEON_BF16; 8800 for (size_t k = 1; k < 8; k++) { 8801 GemmMicrokernelTester() 8802 .mr(1) 8803 .nr(4) 8804 .kr(8) 8805 .sr(1) 8806 .m(1) 8807 .n(4) 8808 .k(k) 8809 .a_stride(11) 8810 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8811 } 8812 } 8813 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_lt_8_subtile)8814 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_lt_8_subtile) { 8815 TEST_REQUIRES_ARM_NEON_BF16; 8816 for (size_t k = 1; k < 8; k++) { 8817 for (uint32_t n = 1; n <= 4; n++) { 8818 for (uint32_t m = 1; m <= 1; m++) { 8819 GemmMicrokernelTester() 8820 .mr(1) 8821 .nr(4) 8822 .kr(8) 8823 .sr(1) 8824 .m(m) 8825 .n(n) 8826 .k(k) 8827 .iterations(1) 8828 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8829 } 8830 } 8831 } 8832 } 8833 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_gt_8)8834 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_gt_8) { 8835 TEST_REQUIRES_ARM_NEON_BF16; 8836 for (size_t k = 9; k < 16; k++) { 8837 GemmMicrokernelTester() 8838 .mr(1) 8839 .nr(4) 8840 .kr(8) 8841 .sr(1) 8842 .m(1) 8843 .n(4) 8844 .k(k) 8845 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8846 } 8847 } 8848 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_gt_8_strided_a)8849 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_gt_8_strided_a) { 8850 TEST_REQUIRES_ARM_NEON_BF16; 8851 for (size_t k = 9; k < 16; k++) { 8852 GemmMicrokernelTester() 8853 .mr(1) 8854 .nr(4) 8855 .kr(8) 8856 .sr(1) 8857 .m(1) 8858 .n(4) 8859 .k(k) 8860 .a_stride(19) 8861 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8862 } 8863 } 
8864 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_gt_8_subtile)8865 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_gt_8_subtile) { 8866 TEST_REQUIRES_ARM_NEON_BF16; 8867 for (size_t k = 9; k < 16; k++) { 8868 for (uint32_t n = 1; n <= 4; n++) { 8869 for (uint32_t m = 1; m <= 1; m++) { 8870 GemmMicrokernelTester() 8871 .mr(1) 8872 .nr(4) 8873 .kr(8) 8874 .sr(1) 8875 .m(m) 8876 .n(n) 8877 .k(k) 8878 .iterations(1) 8879 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8880 } 8881 } 8882 } 8883 } 8884 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_div_8)8885 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_div_8) { 8886 TEST_REQUIRES_ARM_NEON_BF16; 8887 for (size_t k = 16; k <= 80; k += 8) { 8888 GemmMicrokernelTester() 8889 .mr(1) 8890 .nr(4) 8891 .kr(8) 8892 .sr(1) 8893 .m(1) 8894 .n(4) 8895 .k(k) 8896 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8897 } 8898 } 8899 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_div_8_strided_a)8900 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_div_8_strided_a) { 8901 TEST_REQUIRES_ARM_NEON_BF16; 8902 for (size_t k = 16; k <= 80; k += 8) { 8903 GemmMicrokernelTester() 8904 .mr(1) 8905 .nr(4) 8906 .kr(8) 8907 .sr(1) 8908 .m(1) 8909 .n(4) 8910 .k(k) 8911 .a_stride(83) 8912 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8913 } 8914 } 8915 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,k_div_8_subtile)8916 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, k_div_8_subtile) { 8917 TEST_REQUIRES_ARM_NEON_BF16; 8918 for (size_t k = 16; k <= 80; k += 8) { 8919 for (uint32_t n = 1; n <= 4; n++) { 8920 for (uint32_t m = 1; m <= 1; m++) { 8921 GemmMicrokernelTester() 8922 .mr(1) 8923 .nr(4) 8924 .kr(8) 8925 .sr(1) 8926 .m(m) 8927 .n(n) 8928 .k(k) 8929 .iterations(1) 8930 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8931 } 8932 } 8933 } 8934 } 8935 
TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_gt_4)8936 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_gt_4) { 8937 TEST_REQUIRES_ARM_NEON_BF16; 8938 for (uint32_t n = 5; n < 8; n++) { 8939 for (size_t k = 1; k <= 40; k += 9) { 8940 GemmMicrokernelTester() 8941 .mr(1) 8942 .nr(4) 8943 .kr(8) 8944 .sr(1) 8945 .m(1) 8946 .n(n) 8947 .k(k) 8948 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8949 } 8950 } 8951 } 8952 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_gt_4_strided_cn)8953 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_gt_4_strided_cn) { 8954 TEST_REQUIRES_ARM_NEON_BF16; 8955 for (uint32_t n = 5; n < 8; n++) { 8956 for (size_t k = 1; k <= 40; k += 9) { 8957 GemmMicrokernelTester() 8958 .mr(1) 8959 .nr(4) 8960 .kr(8) 8961 .sr(1) 8962 .m(1) 8963 .n(n) 8964 .k(k) 8965 .cn_stride(7) 8966 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8967 } 8968 } 8969 } 8970 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_gt_4_strided_a)8971 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_gt_4_strided_a) { 8972 TEST_REQUIRES_ARM_NEON_BF16; 8973 for (uint32_t n = 5; n < 8; n++) { 8974 for (size_t k = 1; k <= 40; k += 9) { 8975 GemmMicrokernelTester() 8976 .mr(1) 8977 .nr(4) 8978 .kr(8) 8979 .sr(1) 8980 .m(1) 8981 .n(n) 8982 .k(k) 8983 .a_stride(43) 8984 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 8985 } 8986 } 8987 } 8988 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_gt_4_subtile)8989 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_gt_4_subtile) { 8990 TEST_REQUIRES_ARM_NEON_BF16; 8991 for (uint32_t n = 5; n < 8; n++) { 8992 for (size_t k = 1; k <= 40; k += 9) { 8993 for (uint32_t m = 1; m <= 1; m++) { 8994 GemmMicrokernelTester() 8995 .mr(1) 8996 .nr(4) 8997 .kr(8) 8998 .sr(1) 8999 .m(m) 9000 .n(n) 9001 .k(k) 9002 .iterations(1) 9003 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 
9004 } 9005 } 9006 } 9007 } 9008 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_div_4)9009 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_div_4) { 9010 TEST_REQUIRES_ARM_NEON_BF16; 9011 for (uint32_t n = 8; n <= 12; n += 4) { 9012 for (size_t k = 1; k <= 40; k += 9) { 9013 GemmMicrokernelTester() 9014 .mr(1) 9015 .nr(4) 9016 .kr(8) 9017 .sr(1) 9018 .m(1) 9019 .n(n) 9020 .k(k) 9021 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9022 } 9023 } 9024 } 9025 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_div_4_strided_cn)9026 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_div_4_strided_cn) { 9027 TEST_REQUIRES_ARM_NEON_BF16; 9028 for (uint32_t n = 8; n <= 12; n += 4) { 9029 for (size_t k = 1; k <= 40; k += 9) { 9030 GemmMicrokernelTester() 9031 .mr(1) 9032 .nr(4) 9033 .kr(8) 9034 .sr(1) 9035 .m(1) 9036 .n(n) 9037 .k(k) 9038 .cn_stride(7) 9039 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9040 } 9041 } 9042 } 9043 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_div_4_strided_a)9044 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_div_4_strided_a) { 9045 TEST_REQUIRES_ARM_NEON_BF16; 9046 for (uint32_t n = 8; n <= 12; n += 4) { 9047 for (size_t k = 1; k <= 40; k += 9) { 9048 GemmMicrokernelTester() 9049 .mr(1) 9050 .nr(4) 9051 .kr(8) 9052 .sr(1) 9053 .m(1) 9054 .n(n) 9055 .k(k) 9056 .a_stride(43) 9057 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9058 } 9059 } 9060 } 9061 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,n_div_4_subtile)9062 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, n_div_4_subtile) { 9063 TEST_REQUIRES_ARM_NEON_BF16; 9064 for (uint32_t n = 8; n <= 12; n += 4) { 9065 for (size_t k = 1; k <= 40; k += 9) { 9066 for (uint32_t m = 1; m <= 1; m++) { 9067 GemmMicrokernelTester() 9068 .mr(1) 9069 .nr(4) 9070 .kr(8) 9071 .sr(1) 9072 .m(m) 9073 .n(n) 9074 .k(k) 9075 .iterations(1) 9076 
.Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9077 } 9078 } 9079 } 9080 } 9081 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,strided_cm_subtile)9082 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, strided_cm_subtile) { 9083 TEST_REQUIRES_ARM_NEON_BF16; 9084 for (size_t k = 1; k <= 40; k += 9) { 9085 for (uint32_t n = 1; n <= 4; n++) { 9086 for (uint32_t m = 1; m <= 1; m++) { 9087 GemmMicrokernelTester() 9088 .mr(1) 9089 .nr(4) 9090 .kr(8) 9091 .sr(1) 9092 .m(m) 9093 .n(n) 9094 .k(k) 9095 .cm_stride(7) 9096 .iterations(1) 9097 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9098 } 9099 } 9100 } 9101 } 9102 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,qmin)9103 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, qmin) { 9104 TEST_REQUIRES_ARM_NEON_BF16; 9105 GemmMicrokernelTester() 9106 .mr(1) 9107 .nr(4) 9108 .kr(8) 9109 .sr(1) 9110 .m(1) 9111 .n(4) 9112 .k(8) 9113 .qmin(128) 9114 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9115 } 9116 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,qmax)9117 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, qmax) { 9118 TEST_REQUIRES_ARM_NEON_BF16; 9119 GemmMicrokernelTester() 9120 .mr(1) 9121 .nr(4) 9122 .kr(8) 9123 .sr(1) 9124 .m(1) 9125 .n(4) 9126 .k(8) 9127 .qmax(128) 9128 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9129 } 9130 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL,strided_cm)9131 TEST(BF16_GEMM_MINMAX_1X4C8__NEONBF16_BFMLAL, strided_cm) { 9132 TEST_REQUIRES_ARM_NEON_BF16; 9133 GemmMicrokernelTester() 9134 .mr(1) 9135 .nr(4) 9136 .kr(8) 9137 .sr(1) 9138 .m(1) 9139 .n(4) 9140 .k(8) 9141 .cm_stride(7) 9142 .Test(xnn_bf16_gemm_minmax_ukernel_1x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9143 } 9144 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 9145 9146 9147 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || 
XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_eq_8)9148 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_eq_8) { 9149 TEST_REQUIRES_ARM_NEON_BF16; 9150 GemmMicrokernelTester() 9151 .mr(2) 9152 .nr(4) 9153 .kr(8) 9154 .sr(1) 9155 .m(2) 9156 .n(4) 9157 .k(8) 9158 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9159 } 9160 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,strided_cn)9161 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, strided_cn) { 9162 TEST_REQUIRES_ARM_NEON_BF16; 9163 GemmMicrokernelTester() 9164 .mr(2) 9165 .nr(4) 9166 .kr(8) 9167 .sr(1) 9168 .m(2) 9169 .n(4) 9170 .k(8) 9171 .cn_stride(7) 9172 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9173 } 9174 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_eq_8_strided_a)9175 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_eq_8_strided_a) { 9176 TEST_REQUIRES_ARM_NEON_BF16; 9177 GemmMicrokernelTester() 9178 .mr(2) 9179 .nr(4) 9180 .kr(8) 9181 .sr(1) 9182 .m(2) 9183 .n(4) 9184 .k(8) 9185 .a_stride(11) 9186 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9187 } 9188 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_eq_8_subtile)9189 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_eq_8_subtile) { 9190 TEST_REQUIRES_ARM_NEON_BF16; 9191 for (uint32_t n = 1; n <= 4; n++) { 9192 for (uint32_t m = 1; m <= 2; m++) { 9193 GemmMicrokernelTester() 9194 .mr(2) 9195 .nr(4) 9196 .kr(8) 9197 .sr(1) 9198 .m(m) 9199 .n(n) 9200 .k(8) 9201 .iterations(1) 9202 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9203 } 9204 } 9205 } 9206 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_m)9207 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_m) { 9208 TEST_REQUIRES_ARM_NEON_BF16; 9209 for (uint32_t m = 1; m <= 2; m++) { 9210 GemmMicrokernelTester() 9211 .mr(2) 9212 .nr(4) 9213 .kr(8) 9214 .sr(1) 9215 .m(m) 9216 
.n(4) 9217 .k(8) 9218 .iterations(1) 9219 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9220 } 9221 } 9222 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_n)9223 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_n) { 9224 TEST_REQUIRES_ARM_NEON_BF16; 9225 for (uint32_t n = 1; n <= 4; n++) { 9226 GemmMicrokernelTester() 9227 .mr(2) 9228 .nr(4) 9229 .kr(8) 9230 .sr(1) 9231 .m(2) 9232 .n(n) 9233 .k(8) 9234 .iterations(1) 9235 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9236 } 9237 } 9238 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_lt_8)9239 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_lt_8) { 9240 TEST_REQUIRES_ARM_NEON_BF16; 9241 for (size_t k = 1; k < 8; k++) { 9242 GemmMicrokernelTester() 9243 .mr(2) 9244 .nr(4) 9245 .kr(8) 9246 .sr(1) 9247 .m(2) 9248 .n(4) 9249 .k(k) 9250 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9251 } 9252 } 9253 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_lt_8_strided_a)9254 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_lt_8_strided_a) { 9255 TEST_REQUIRES_ARM_NEON_BF16; 9256 for (size_t k = 1; k < 8; k++) { 9257 GemmMicrokernelTester() 9258 .mr(2) 9259 .nr(4) 9260 .kr(8) 9261 .sr(1) 9262 .m(2) 9263 .n(4) 9264 .k(k) 9265 .a_stride(11) 9266 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9267 } 9268 } 9269 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_lt_8_subtile)9270 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_lt_8_subtile) { 9271 TEST_REQUIRES_ARM_NEON_BF16; 9272 for (size_t k = 1; k < 8; k++) { 9273 for (uint32_t n = 1; n <= 4; n++) { 9274 for (uint32_t m = 1; m <= 2; m++) { 9275 GemmMicrokernelTester() 9276 .mr(2) 9277 .nr(4) 9278 .kr(8) 9279 .sr(1) 9280 .m(m) 9281 .n(n) 9282 .k(k) 9283 .iterations(1) 9284 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 
9285 } 9286 } 9287 } 9288 } 9289 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_gt_8)9290 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_gt_8) { 9291 TEST_REQUIRES_ARM_NEON_BF16; 9292 for (size_t k = 9; k < 16; k++) { 9293 GemmMicrokernelTester() 9294 .mr(2) 9295 .nr(4) 9296 .kr(8) 9297 .sr(1) 9298 .m(2) 9299 .n(4) 9300 .k(k) 9301 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9302 } 9303 } 9304 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_gt_8_strided_a)9305 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_gt_8_strided_a) { 9306 TEST_REQUIRES_ARM_NEON_BF16; 9307 for (size_t k = 9; k < 16; k++) { 9308 GemmMicrokernelTester() 9309 .mr(2) 9310 .nr(4) 9311 .kr(8) 9312 .sr(1) 9313 .m(2) 9314 .n(4) 9315 .k(k) 9316 .a_stride(19) 9317 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9318 } 9319 } 9320 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_gt_8_subtile)9321 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_gt_8_subtile) { 9322 TEST_REQUIRES_ARM_NEON_BF16; 9323 for (size_t k = 9; k < 16; k++) { 9324 for (uint32_t n = 1; n <= 4; n++) { 9325 for (uint32_t m = 1; m <= 2; m++) { 9326 GemmMicrokernelTester() 9327 .mr(2) 9328 .nr(4) 9329 .kr(8) 9330 .sr(1) 9331 .m(m) 9332 .n(n) 9333 .k(k) 9334 .iterations(1) 9335 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9336 } 9337 } 9338 } 9339 } 9340 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_div_8)9341 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_div_8) { 9342 TEST_REQUIRES_ARM_NEON_BF16; 9343 for (size_t k = 16; k <= 80; k += 8) { 9344 GemmMicrokernelTester() 9345 .mr(2) 9346 .nr(4) 9347 .kr(8) 9348 .sr(1) 9349 .m(2) 9350 .n(4) 9351 .k(k) 9352 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9353 } 9354 } 9355 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_div_8_strided_a)9356 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, 
k_div_8_strided_a) { 9357 TEST_REQUIRES_ARM_NEON_BF16; 9358 for (size_t k = 16; k <= 80; k += 8) { 9359 GemmMicrokernelTester() 9360 .mr(2) 9361 .nr(4) 9362 .kr(8) 9363 .sr(1) 9364 .m(2) 9365 .n(4) 9366 .k(k) 9367 .a_stride(83) 9368 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9369 } 9370 } 9371 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,k_div_8_subtile)9372 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, k_div_8_subtile) { 9373 TEST_REQUIRES_ARM_NEON_BF16; 9374 for (size_t k = 16; k <= 80; k += 8) { 9375 for (uint32_t n = 1; n <= 4; n++) { 9376 for (uint32_t m = 1; m <= 2; m++) { 9377 GemmMicrokernelTester() 9378 .mr(2) 9379 .nr(4) 9380 .kr(8) 9381 .sr(1) 9382 .m(m) 9383 .n(n) 9384 .k(k) 9385 .iterations(1) 9386 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9387 } 9388 } 9389 } 9390 } 9391 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_gt_4)9392 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_gt_4) { 9393 TEST_REQUIRES_ARM_NEON_BF16; 9394 for (uint32_t n = 5; n < 8; n++) { 9395 for (size_t k = 1; k <= 40; k += 9) { 9396 GemmMicrokernelTester() 9397 .mr(2) 9398 .nr(4) 9399 .kr(8) 9400 .sr(1) 9401 .m(2) 9402 .n(n) 9403 .k(k) 9404 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9405 } 9406 } 9407 } 9408 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_gt_4_strided_cn)9409 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_gt_4_strided_cn) { 9410 TEST_REQUIRES_ARM_NEON_BF16; 9411 for (uint32_t n = 5; n < 8; n++) { 9412 for (size_t k = 1; k <= 40; k += 9) { 9413 GemmMicrokernelTester() 9414 .mr(2) 9415 .nr(4) 9416 .kr(8) 9417 .sr(1) 9418 .m(2) 9419 .n(n) 9420 .k(k) 9421 .cn_stride(7) 9422 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9423 } 9424 } 9425 } 9426 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_gt_4_strided_a)9427 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, 
n_gt_4_strided_a) { 9428 TEST_REQUIRES_ARM_NEON_BF16; 9429 for (uint32_t n = 5; n < 8; n++) { 9430 for (size_t k = 1; k <= 40; k += 9) { 9431 GemmMicrokernelTester() 9432 .mr(2) 9433 .nr(4) 9434 .kr(8) 9435 .sr(1) 9436 .m(2) 9437 .n(n) 9438 .k(k) 9439 .a_stride(43) 9440 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9441 } 9442 } 9443 } 9444 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_gt_4_subtile)9445 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_gt_4_subtile) { 9446 TEST_REQUIRES_ARM_NEON_BF16; 9447 for (uint32_t n = 5; n < 8; n++) { 9448 for (size_t k = 1; k <= 40; k += 9) { 9449 for (uint32_t m = 1; m <= 2; m++) { 9450 GemmMicrokernelTester() 9451 .mr(2) 9452 .nr(4) 9453 .kr(8) 9454 .sr(1) 9455 .m(m) 9456 .n(n) 9457 .k(k) 9458 .iterations(1) 9459 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9460 } 9461 } 9462 } 9463 } 9464 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_div_4)9465 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_div_4) { 9466 TEST_REQUIRES_ARM_NEON_BF16; 9467 for (uint32_t n = 8; n <= 12; n += 4) { 9468 for (size_t k = 1; k <= 40; k += 9) { 9469 GemmMicrokernelTester() 9470 .mr(2) 9471 .nr(4) 9472 .kr(8) 9473 .sr(1) 9474 .m(2) 9475 .n(n) 9476 .k(k) 9477 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9478 } 9479 } 9480 } 9481 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_div_4_strided_cn)9482 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_div_4_strided_cn) { 9483 TEST_REQUIRES_ARM_NEON_BF16; 9484 for (uint32_t n = 8; n <= 12; n += 4) { 9485 for (size_t k = 1; k <= 40; k += 9) { 9486 GemmMicrokernelTester() 9487 .mr(2) 9488 .nr(4) 9489 .kr(8) 9490 .sr(1) 9491 .m(2) 9492 .n(n) 9493 .k(k) 9494 .cn_stride(7) 9495 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9496 } 9497 } 9498 } 9499 
TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_div_4_strided_a)9500 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_div_4_strided_a) { 9501 TEST_REQUIRES_ARM_NEON_BF16; 9502 for (uint32_t n = 8; n <= 12; n += 4) { 9503 for (size_t k = 1; k <= 40; k += 9) { 9504 GemmMicrokernelTester() 9505 .mr(2) 9506 .nr(4) 9507 .kr(8) 9508 .sr(1) 9509 .m(2) 9510 .n(n) 9511 .k(k) 9512 .a_stride(43) 9513 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9514 } 9515 } 9516 } 9517 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,n_div_4_subtile)9518 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, n_div_4_subtile) { 9519 TEST_REQUIRES_ARM_NEON_BF16; 9520 for (uint32_t n = 8; n <= 12; n += 4) { 9521 for (size_t k = 1; k <= 40; k += 9) { 9522 for (uint32_t m = 1; m <= 2; m++) { 9523 GemmMicrokernelTester() 9524 .mr(2) 9525 .nr(4) 9526 .kr(8) 9527 .sr(1) 9528 .m(m) 9529 .n(n) 9530 .k(k) 9531 .iterations(1) 9532 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9533 } 9534 } 9535 } 9536 } 9537 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,strided_cm_subtile)9538 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, strided_cm_subtile) { 9539 TEST_REQUIRES_ARM_NEON_BF16; 9540 for (size_t k = 1; k <= 40; k += 9) { 9541 for (uint32_t n = 1; n <= 4; n++) { 9542 for (uint32_t m = 1; m <= 2; m++) { 9543 GemmMicrokernelTester() 9544 .mr(2) 9545 .nr(4) 9546 .kr(8) 9547 .sr(1) 9548 .m(m) 9549 .n(n) 9550 .k(k) 9551 .cm_stride(7) 9552 .iterations(1) 9553 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9554 } 9555 } 9556 } 9557 } 9558 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,qmin)9559 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, qmin) { 9560 TEST_REQUIRES_ARM_NEON_BF16; 9561 GemmMicrokernelTester() 9562 .mr(2) 9563 .nr(4) 9564 .kr(8) 9565 .sr(1) 9566 .m(2) 9567 .n(4) 9568 .k(8) 9569 .qmin(128) 9570 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, 
xnn_init_bf16_minmax_scalar_params); 9571 } 9572 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,qmax)9573 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, qmax) { 9574 TEST_REQUIRES_ARM_NEON_BF16; 9575 GemmMicrokernelTester() 9576 .mr(2) 9577 .nr(4) 9578 .kr(8) 9579 .sr(1) 9580 .m(2) 9581 .n(4) 9582 .k(8) 9583 .qmax(128) 9584 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9585 } 9586 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL,strided_cm)9587 TEST(BF16_GEMM_MINMAX_2X4C8__NEONBF16_BFMLAL, strided_cm) { 9588 TEST_REQUIRES_ARM_NEON_BF16; 9589 GemmMicrokernelTester() 9590 .mr(2) 9591 .nr(4) 9592 .kr(8) 9593 .sr(1) 9594 .m(2) 9595 .n(4) 9596 .k(8) 9597 .cm_stride(7) 9598 .Test(xnn_bf16_gemm_minmax_ukernel_2x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9599 } 9600 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 9601 9602 9603 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,k_eq_8)9604 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_eq_8) { 9605 TEST_REQUIRES_ARM_NEON_BF16; 9606 GemmMicrokernelTester() 9607 .mr(3) 9608 .nr(4) 9609 .kr(8) 9610 .sr(1) 9611 .m(3) 9612 .n(4) 9613 .k(8) 9614 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9615 } 9616 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,strided_cn)9617 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, strided_cn) { 9618 TEST_REQUIRES_ARM_NEON_BF16; 9619 GemmMicrokernelTester() 9620 .mr(3) 9621 .nr(4) 9622 .kr(8) 9623 .sr(1) 9624 .m(3) 9625 .n(4) 9626 .k(8) 9627 .cn_stride(7) 9628 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9629 } 9630 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,k_eq_8_strided_a)9631 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_eq_8_strided_a) { 9632 TEST_REQUIRES_ARM_NEON_BF16; 9633 GemmMicrokernelTester() 9634 .mr(3) 9635 .nr(4) 9636 .kr(8) 9637 .sr(1) 9638 .m(3) 9639 
.n(4) 9640 .k(8) 9641 .a_stride(11) 9642 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9643 } 9644 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,k_eq_8_subtile)9645 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_eq_8_subtile) { 9646 TEST_REQUIRES_ARM_NEON_BF16; 9647 for (uint32_t n = 1; n <= 4; n++) { 9648 for (uint32_t m = 1; m <= 3; m++) { 9649 GemmMicrokernelTester() 9650 .mr(3) 9651 .nr(4) 9652 .kr(8) 9653 .sr(1) 9654 .m(m) 9655 .n(n) 9656 .k(8) 9657 .iterations(1) 9658 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9659 } 9660 } 9661 } 9662 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_m)9663 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_m) { 9664 TEST_REQUIRES_ARM_NEON_BF16; 9665 for (uint32_t m = 1; m <= 3; m++) { 9666 GemmMicrokernelTester() 9667 .mr(3) 9668 .nr(4) 9669 .kr(8) 9670 .sr(1) 9671 .m(m) 9672 .n(4) 9673 .k(8) 9674 .iterations(1) 9675 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9676 } 9677 } 9678 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_n)9679 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_n) { 9680 TEST_REQUIRES_ARM_NEON_BF16; 9681 for (uint32_t n = 1; n <= 4; n++) { 9682 GemmMicrokernelTester() 9683 .mr(3) 9684 .nr(4) 9685 .kr(8) 9686 .sr(1) 9687 .m(3) 9688 .n(n) 9689 .k(8) 9690 .iterations(1) 9691 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9692 } 9693 } 9694 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,k_lt_8)9695 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_lt_8) { 9696 TEST_REQUIRES_ARM_NEON_BF16; 9697 for (size_t k = 1; k < 8; k++) { 9698 GemmMicrokernelTester() 9699 .mr(3) 9700 .nr(4) 9701 .kr(8) 9702 .sr(1) 9703 .m(3) 9704 .n(4) 9705 .k(k) 9706 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 9707 } 9708 } 9709 
// ---- 3x4c8 NEON BF16 (BFMLAL) kernel: K and N sweeps (mr=3, nr=4, kr=8, sr=1) ----

// Full 3x4 tile, every k in 1..7, with a_stride(11).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .a_stride(11)
      .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every k in 1..7, n in 1..4 and m in 1..3; one iteration per combination.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// Full 3x4 tile, every k in 9..15.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// Full 3x4 tile, every k in 9..15, with a_stride(19).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .a_stride(19)
      .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every k in 9..15, n in 1..4 and m in 1..3; one iteration per combination.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// Full 3x4 tile, k in {16, 24, ..., 80}.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_div_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// Full 3x4 tile, k in {16, 24, ..., 80}, with a_stride(83).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(3).nr(4).kr(8).sr(1)
      .m(3).n(4).k(k)
      .a_stride(83)
      .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// k in {16, 24, ..., 80}, every n in 1..4 and m in 1..3; one iteration per combination.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// n in 5..7, k in {1, 10, 19, 28, 37}, m = 3.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_gt_4) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n).k(k)
        .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in 5..7, k in {1, 10, 19, 28, 37}, m = 3, with cn_stride(7).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_gt_4_strided_cn) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n).k(k)
        .cn_stride(7)
        .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in 5..7, k in {1, 10, 19, 28, 37}, m = 3, with a_stride(43).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_gt_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n).k(k)
        .a_stride(43)
        .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in 5..7, k in {1, 10, 19, 28, 37}, every m in 1..3; one iteration per combination.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 5; n < 8; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// n in {8, 12}, k in {1, 10, 19, 28, 37}, m = 3.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_div_4) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n).k(k)
        .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in {8, 12}, k in {1, 10, 19, 28, 37}, m = 3, with cn_stride(7).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_div_4_strided_cn) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n).k(k)
        .cn_stride(7)
        .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in {8, 12}, k in {1, 10, 19, 28, 37}, m = 3, with a_stride(43).
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_div_4_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(3).nr(4).kr(8).sr(1)
        .m(3).n(n).k(k)
        .a_stride(43)
        .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
    }
  }
}

// n in {8, 12}, k in {1, 10, 19, 28, 37}, every m in 1..3; one iteration per combination.
TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, n_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (uint32_t n = 8; n <= 12; n += 4) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}
9993 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,strided_cm_subtile)9994 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, strided_cm_subtile) { 9995 TEST_REQUIRES_ARM_NEON_BF16; 9996 for (size_t k = 1; k <= 40; k += 9) { 9997 for (uint32_t n = 1; n <= 4; n++) { 9998 for (uint32_t m = 1; m <= 3; m++) { 9999 GemmMicrokernelTester() 10000 .mr(3) 10001 .nr(4) 10002 .kr(8) 10003 .sr(1) 10004 .m(m) 10005 .n(n) 10006 .k(k) 10007 .cm_stride(7) 10008 .iterations(1) 10009 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10010 } 10011 } 10012 } 10013 } 10014 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,qmin)10015 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, qmin) { 10016 TEST_REQUIRES_ARM_NEON_BF16; 10017 GemmMicrokernelTester() 10018 .mr(3) 10019 .nr(4) 10020 .kr(8) 10021 .sr(1) 10022 .m(3) 10023 .n(4) 10024 .k(8) 10025 .qmin(128) 10026 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10027 } 10028 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,qmax)10029 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, qmax) { 10030 TEST_REQUIRES_ARM_NEON_BF16; 10031 GemmMicrokernelTester() 10032 .mr(3) 10033 .nr(4) 10034 .kr(8) 10035 .sr(1) 10036 .m(3) 10037 .n(4) 10038 .k(8) 10039 .qmax(128) 10040 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10041 } 10042 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL,strided_cm)10043 TEST(BF16_GEMM_MINMAX_3X4C8__NEONBF16_BFMLAL, strided_cm) { 10044 TEST_REQUIRES_ARM_NEON_BF16; 10045 GemmMicrokernelTester() 10046 .mr(3) 10047 .nr(4) 10048 .kr(8) 10049 .sr(1) 10050 .m(3) 10051 .n(4) 10052 .k(8) 10053 .cm_stride(7) 10054 .Test(xnn_bf16_gemm_minmax_ukernel_3x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10055 } 10056 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 10057 10058 10059 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 
TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_eq_8)10060 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_eq_8) { 10061 TEST_REQUIRES_ARM_NEON_BF16; 10062 GemmMicrokernelTester() 10063 .mr(4) 10064 .nr(4) 10065 .kr(8) 10066 .sr(1) 10067 .m(4) 10068 .n(4) 10069 .k(8) 10070 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10071 } 10072 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,strided_cn)10073 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, strided_cn) { 10074 TEST_REQUIRES_ARM_NEON_BF16; 10075 GemmMicrokernelTester() 10076 .mr(4) 10077 .nr(4) 10078 .kr(8) 10079 .sr(1) 10080 .m(4) 10081 .n(4) 10082 .k(8) 10083 .cn_stride(7) 10084 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10085 } 10086 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_eq_8_strided_a)10087 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_eq_8_strided_a) { 10088 TEST_REQUIRES_ARM_NEON_BF16; 10089 GemmMicrokernelTester() 10090 .mr(4) 10091 .nr(4) 10092 .kr(8) 10093 .sr(1) 10094 .m(4) 10095 .n(4) 10096 .k(8) 10097 .a_stride(11) 10098 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10099 } 10100 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_eq_8_subtile)10101 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_eq_8_subtile) { 10102 TEST_REQUIRES_ARM_NEON_BF16; 10103 for (uint32_t n = 1; n <= 4; n++) { 10104 for (uint32_t m = 1; m <= 4; m++) { 10105 GemmMicrokernelTester() 10106 .mr(4) 10107 .nr(4) 10108 .kr(8) 10109 .sr(1) 10110 .m(m) 10111 .n(n) 10112 .k(8) 10113 .iterations(1) 10114 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10115 } 10116 } 10117 } 10118 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_m)10119 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_m) { 10120 TEST_REQUIRES_ARM_NEON_BF16; 10121 for (uint32_t m = 1; m <= 4; m++) { 10122 GemmMicrokernelTester() 10123 .mr(4) 10124 
.nr(4) 10125 .kr(8) 10126 .sr(1) 10127 .m(m) 10128 .n(4) 10129 .k(8) 10130 .iterations(1) 10131 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10132 } 10133 } 10134 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_n)10135 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_n) { 10136 TEST_REQUIRES_ARM_NEON_BF16; 10137 for (uint32_t n = 1; n <= 4; n++) { 10138 GemmMicrokernelTester() 10139 .mr(4) 10140 .nr(4) 10141 .kr(8) 10142 .sr(1) 10143 .m(4) 10144 .n(n) 10145 .k(8) 10146 .iterations(1) 10147 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10148 } 10149 } 10150 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_lt_8)10151 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_lt_8) { 10152 TEST_REQUIRES_ARM_NEON_BF16; 10153 for (size_t k = 1; k < 8; k++) { 10154 GemmMicrokernelTester() 10155 .mr(4) 10156 .nr(4) 10157 .kr(8) 10158 .sr(1) 10159 .m(4) 10160 .n(4) 10161 .k(k) 10162 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10163 } 10164 } 10165 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_lt_8_strided_a)10166 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_lt_8_strided_a) { 10167 TEST_REQUIRES_ARM_NEON_BF16; 10168 for (size_t k = 1; k < 8; k++) { 10169 GemmMicrokernelTester() 10170 .mr(4) 10171 .nr(4) 10172 .kr(8) 10173 .sr(1) 10174 .m(4) 10175 .n(4) 10176 .k(k) 10177 .a_stride(11) 10178 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10179 } 10180 } 10181 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_lt_8_subtile)10182 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_lt_8_subtile) { 10183 TEST_REQUIRES_ARM_NEON_BF16; 10184 for (size_t k = 1; k < 8; k++) { 10185 for (uint32_t n = 1; n <= 4; n++) { 10186 for (uint32_t m = 1; m <= 4; m++) { 10187 GemmMicrokernelTester() 10188 .mr(4) 10189 .nr(4) 10190 .kr(8) 10191 .sr(1) 10192 .m(m) 10193 .n(n) 10194 .k(k) 10195 
.iterations(1) 10196 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10197 } 10198 } 10199 } 10200 } 10201 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_gt_8)10202 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_gt_8) { 10203 TEST_REQUIRES_ARM_NEON_BF16; 10204 for (size_t k = 9; k < 16; k++) { 10205 GemmMicrokernelTester() 10206 .mr(4) 10207 .nr(4) 10208 .kr(8) 10209 .sr(1) 10210 .m(4) 10211 .n(4) 10212 .k(k) 10213 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10214 } 10215 } 10216 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_gt_8_strided_a)10217 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_gt_8_strided_a) { 10218 TEST_REQUIRES_ARM_NEON_BF16; 10219 for (size_t k = 9; k < 16; k++) { 10220 GemmMicrokernelTester() 10221 .mr(4) 10222 .nr(4) 10223 .kr(8) 10224 .sr(1) 10225 .m(4) 10226 .n(4) 10227 .k(k) 10228 .a_stride(19) 10229 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10230 } 10231 } 10232 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_gt_8_subtile)10233 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_gt_8_subtile) { 10234 TEST_REQUIRES_ARM_NEON_BF16; 10235 for (size_t k = 9; k < 16; k++) { 10236 for (uint32_t n = 1; n <= 4; n++) { 10237 for (uint32_t m = 1; m <= 4; m++) { 10238 GemmMicrokernelTester() 10239 .mr(4) 10240 .nr(4) 10241 .kr(8) 10242 .sr(1) 10243 .m(m) 10244 .n(n) 10245 .k(k) 10246 .iterations(1) 10247 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10248 } 10249 } 10250 } 10251 } 10252 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_div_8)10253 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_div_8) { 10254 TEST_REQUIRES_ARM_NEON_BF16; 10255 for (size_t k = 16; k <= 80; k += 8) { 10256 GemmMicrokernelTester() 10257 .mr(4) 10258 .nr(4) 10259 .kr(8) 10260 .sr(1) 10261 .m(4) 10262 .n(4) 10263 .k(k) 10264 
.Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10265 } 10266 } 10267 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_div_8_strided_a)10268 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_div_8_strided_a) { 10269 TEST_REQUIRES_ARM_NEON_BF16; 10270 for (size_t k = 16; k <= 80; k += 8) { 10271 GemmMicrokernelTester() 10272 .mr(4) 10273 .nr(4) 10274 .kr(8) 10275 .sr(1) 10276 .m(4) 10277 .n(4) 10278 .k(k) 10279 .a_stride(83) 10280 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10281 } 10282 } 10283 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,k_div_8_subtile)10284 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, k_div_8_subtile) { 10285 TEST_REQUIRES_ARM_NEON_BF16; 10286 for (size_t k = 16; k <= 80; k += 8) { 10287 for (uint32_t n = 1; n <= 4; n++) { 10288 for (uint32_t m = 1; m <= 4; m++) { 10289 GemmMicrokernelTester() 10290 .mr(4) 10291 .nr(4) 10292 .kr(8) 10293 .sr(1) 10294 .m(m) 10295 .n(n) 10296 .k(k) 10297 .iterations(1) 10298 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10299 } 10300 } 10301 } 10302 } 10303 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_gt_4)10304 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_gt_4) { 10305 TEST_REQUIRES_ARM_NEON_BF16; 10306 for (uint32_t n = 5; n < 8; n++) { 10307 for (size_t k = 1; k <= 40; k += 9) { 10308 GemmMicrokernelTester() 10309 .mr(4) 10310 .nr(4) 10311 .kr(8) 10312 .sr(1) 10313 .m(4) 10314 .n(n) 10315 .k(k) 10316 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10317 } 10318 } 10319 } 10320 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_gt_4_strided_cn)10321 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_gt_4_strided_cn) { 10322 TEST_REQUIRES_ARM_NEON_BF16; 10323 for (uint32_t n = 5; n < 8; n++) { 10324 for (size_t k = 1; k <= 40; k += 9) { 10325 GemmMicrokernelTester() 10326 .mr(4) 10327 .nr(4) 10328 .kr(8) 10329 .sr(1) 10330 
.m(4) 10331 .n(n) 10332 .k(k) 10333 .cn_stride(7) 10334 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10335 } 10336 } 10337 } 10338 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_gt_4_strided_a)10339 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_gt_4_strided_a) { 10340 TEST_REQUIRES_ARM_NEON_BF16; 10341 for (uint32_t n = 5; n < 8; n++) { 10342 for (size_t k = 1; k <= 40; k += 9) { 10343 GemmMicrokernelTester() 10344 .mr(4) 10345 .nr(4) 10346 .kr(8) 10347 .sr(1) 10348 .m(4) 10349 .n(n) 10350 .k(k) 10351 .a_stride(43) 10352 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10353 } 10354 } 10355 } 10356 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_gt_4_subtile)10357 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_gt_4_subtile) { 10358 TEST_REQUIRES_ARM_NEON_BF16; 10359 for (uint32_t n = 5; n < 8; n++) { 10360 for (size_t k = 1; k <= 40; k += 9) { 10361 for (uint32_t m = 1; m <= 4; m++) { 10362 GemmMicrokernelTester() 10363 .mr(4) 10364 .nr(4) 10365 .kr(8) 10366 .sr(1) 10367 .m(m) 10368 .n(n) 10369 .k(k) 10370 .iterations(1) 10371 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10372 } 10373 } 10374 } 10375 } 10376 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_div_4)10377 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_div_4) { 10378 TEST_REQUIRES_ARM_NEON_BF16; 10379 for (uint32_t n = 8; n <= 12; n += 4) { 10380 for (size_t k = 1; k <= 40; k += 9) { 10381 GemmMicrokernelTester() 10382 .mr(4) 10383 .nr(4) 10384 .kr(8) 10385 .sr(1) 10386 .m(4) 10387 .n(n) 10388 .k(k) 10389 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10390 } 10391 } 10392 } 10393 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_div_4_strided_cn)10394 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_div_4_strided_cn) { 10395 TEST_REQUIRES_ARM_NEON_BF16; 10396 for (uint32_t n = 8; n <= 12; n += 4) { 10397 for 
(size_t k = 1; k <= 40; k += 9) { 10398 GemmMicrokernelTester() 10399 .mr(4) 10400 .nr(4) 10401 .kr(8) 10402 .sr(1) 10403 .m(4) 10404 .n(n) 10405 .k(k) 10406 .cn_stride(7) 10407 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10408 } 10409 } 10410 } 10411 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_div_4_strided_a)10412 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_div_4_strided_a) { 10413 TEST_REQUIRES_ARM_NEON_BF16; 10414 for (uint32_t n = 8; n <= 12; n += 4) { 10415 for (size_t k = 1; k <= 40; k += 9) { 10416 GemmMicrokernelTester() 10417 .mr(4) 10418 .nr(4) 10419 .kr(8) 10420 .sr(1) 10421 .m(4) 10422 .n(n) 10423 .k(k) 10424 .a_stride(43) 10425 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10426 } 10427 } 10428 } 10429 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,n_div_4_subtile)10430 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, n_div_4_subtile) { 10431 TEST_REQUIRES_ARM_NEON_BF16; 10432 for (uint32_t n = 8; n <= 12; n += 4) { 10433 for (size_t k = 1; k <= 40; k += 9) { 10434 for (uint32_t m = 1; m <= 4; m++) { 10435 GemmMicrokernelTester() 10436 .mr(4) 10437 .nr(4) 10438 .kr(8) 10439 .sr(1) 10440 .m(m) 10441 .n(n) 10442 .k(k) 10443 .iterations(1) 10444 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10445 } 10446 } 10447 } 10448 } 10449 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,strided_cm_subtile)10450 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, strided_cm_subtile) { 10451 TEST_REQUIRES_ARM_NEON_BF16; 10452 for (size_t k = 1; k <= 40; k += 9) { 10453 for (uint32_t n = 1; n <= 4; n++) { 10454 for (uint32_t m = 1; m <= 4; m++) { 10455 GemmMicrokernelTester() 10456 .mr(4) 10457 .nr(4) 10458 .kr(8) 10459 .sr(1) 10460 .m(m) 10461 .n(n) 10462 .k(k) 10463 .cm_stride(7) 10464 .iterations(1) 10465 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10466 } 10467 } 10468 } 
10469 } 10470 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,qmin)10471 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, qmin) { 10472 TEST_REQUIRES_ARM_NEON_BF16; 10473 GemmMicrokernelTester() 10474 .mr(4) 10475 .nr(4) 10476 .kr(8) 10477 .sr(1) 10478 .m(4) 10479 .n(4) 10480 .k(8) 10481 .qmin(128) 10482 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10483 } 10484 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,qmax)10485 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, qmax) { 10486 TEST_REQUIRES_ARM_NEON_BF16; 10487 GemmMicrokernelTester() 10488 .mr(4) 10489 .nr(4) 10490 .kr(8) 10491 .sr(1) 10492 .m(4) 10493 .n(4) 10494 .k(8) 10495 .qmax(128) 10496 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10497 } 10498 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL,strided_cm)10499 TEST(BF16_GEMM_MINMAX_4X4C8__NEONBF16_BFMLAL, strided_cm) { 10500 TEST_REQUIRES_ARM_NEON_BF16; 10501 GemmMicrokernelTester() 10502 .mr(4) 10503 .nr(4) 10504 .kr(8) 10505 .sr(1) 10506 .m(4) 10507 .n(4) 10508 .k(8) 10509 .cm_stride(7) 10510 .Test(xnn_bf16_gemm_minmax_ukernel_4x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10511 } 10512 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 10513 10514 10515 #if XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_eq_8)10516 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_eq_8) { 10517 TEST_REQUIRES_ARM_NEON_BF16; 10518 GemmMicrokernelTester() 10519 .mr(5) 10520 .nr(4) 10521 .kr(8) 10522 .sr(1) 10523 .m(5) 10524 .n(4) 10525 .k(8) 10526 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10527 } 10528 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,strided_cn)10529 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, strided_cn) { 10530 TEST_REQUIRES_ARM_NEON_BF16; 10531 GemmMicrokernelTester() 10532 .mr(5) 10533 .nr(4) 10534 .kr(8) 10535 .sr(1) 10536 .m(5) 
10537 .n(4) 10538 .k(8) 10539 .cn_stride(7) 10540 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10541 } 10542 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_eq_8_strided_a)10543 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_eq_8_strided_a) { 10544 TEST_REQUIRES_ARM_NEON_BF16; 10545 GemmMicrokernelTester() 10546 .mr(5) 10547 .nr(4) 10548 .kr(8) 10549 .sr(1) 10550 .m(5) 10551 .n(4) 10552 .k(8) 10553 .a_stride(11) 10554 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10555 } 10556 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_eq_8_subtile)10557 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_eq_8_subtile) { 10558 TEST_REQUIRES_ARM_NEON_BF16; 10559 for (uint32_t n = 1; n <= 4; n++) { 10560 for (uint32_t m = 1; m <= 5; m++) { 10561 GemmMicrokernelTester() 10562 .mr(5) 10563 .nr(4) 10564 .kr(8) 10565 .sr(1) 10566 .m(m) 10567 .n(n) 10568 .k(8) 10569 .iterations(1) 10570 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10571 } 10572 } 10573 } 10574 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_m)10575 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_m) { 10576 TEST_REQUIRES_ARM_NEON_BF16; 10577 for (uint32_t m = 1; m <= 5; m++) { 10578 GemmMicrokernelTester() 10579 .mr(5) 10580 .nr(4) 10581 .kr(8) 10582 .sr(1) 10583 .m(m) 10584 .n(4) 10585 .k(8) 10586 .iterations(1) 10587 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10588 } 10589 } 10590 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_eq_8_subtile_n)10591 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_eq_8_subtile_n) { 10592 TEST_REQUIRES_ARM_NEON_BF16; 10593 for (uint32_t n = 1; n <= 4; n++) { 10594 GemmMicrokernelTester() 10595 .mr(5) 10596 .nr(4) 10597 .kr(8) 10598 .sr(1) 10599 .m(5) 10600 .n(n) 10601 .k(8) 10602 .iterations(1) 10603 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, 
xnn_init_bf16_minmax_scalar_params); 10604 } 10605 } 10606 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_lt_8)10607 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_lt_8) { 10608 TEST_REQUIRES_ARM_NEON_BF16; 10609 for (size_t k = 1; k < 8; k++) { 10610 GemmMicrokernelTester() 10611 .mr(5) 10612 .nr(4) 10613 .kr(8) 10614 .sr(1) 10615 .m(5) 10616 .n(4) 10617 .k(k) 10618 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10619 } 10620 } 10621 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_lt_8_strided_a)10622 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_lt_8_strided_a) { 10623 TEST_REQUIRES_ARM_NEON_BF16; 10624 for (size_t k = 1; k < 8; k++) { 10625 GemmMicrokernelTester() 10626 .mr(5) 10627 .nr(4) 10628 .kr(8) 10629 .sr(1) 10630 .m(5) 10631 .n(4) 10632 .k(k) 10633 .a_stride(11) 10634 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10635 } 10636 } 10637 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_lt_8_subtile)10638 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_lt_8_subtile) { 10639 TEST_REQUIRES_ARM_NEON_BF16; 10640 for (size_t k = 1; k < 8; k++) { 10641 for (uint32_t n = 1; n <= 4; n++) { 10642 for (uint32_t m = 1; m <= 5; m++) { 10643 GemmMicrokernelTester() 10644 .mr(5) 10645 .nr(4) 10646 .kr(8) 10647 .sr(1) 10648 .m(m) 10649 .n(n) 10650 .k(k) 10651 .iterations(1) 10652 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10653 } 10654 } 10655 } 10656 } 10657 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_gt_8)10658 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_gt_8) { 10659 TEST_REQUIRES_ARM_NEON_BF16; 10660 for (size_t k = 9; k < 16; k++) { 10661 GemmMicrokernelTester() 10662 .mr(5) 10663 .nr(4) 10664 .kr(8) 10665 .sr(1) 10666 .m(5) 10667 .n(4) 10668 .k(k) 10669 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10670 } 10671 } 10672 
// ---- 5x4c8 NEON BF16 (BFMLAL) kernel: K > 8 and K % 8 == 0 cases (mr=5, nr=4, kr=8, sr=1) ----

// Full 5x4 tile, every k in 9..15, with a_stride(19).
TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(5).nr(4).kr(8).sr(1)
      .m(5).n(4).k(k)
      .a_stride(19)
      .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// Every k in 9..15, n in 1..4 and m in 1..5; one iteration per combination.
TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 4; n++) {
      for (uint32_t m = 1; m <= 5; m++) {
        GemmMicrokernelTester()
          .mr(5).nr(4).kr(8).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
      }
    }
  }
}

// Full 5x4 tile, k in {16, 24, ..., 80}.
TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_div_8) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(5).nr(4).kr(8).sr(1)
      .m(5).n(4).k(k)
      .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}

// Full 5x4 tile, k in {16, 24, ..., 80}, with a_stride(83).
TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_BF16;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(5).nr(4).kr(8).sr(1)
      .m(5).n(4).k(k)
      .a_stride(83)
      .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params);
  }
}
TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,k_div_8_subtile)10740 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, k_div_8_subtile) { 10741 TEST_REQUIRES_ARM_NEON_BF16; 10742 for (size_t k = 16; k <= 80; k += 8) { 10743 for (uint32_t n = 1; n <= 4; n++) { 10744 for (uint32_t m = 1; m <= 5; m++) { 10745 GemmMicrokernelTester() 10746 .mr(5) 10747 .nr(4) 10748 .kr(8) 10749 .sr(1) 10750 .m(m) 10751 .n(n) 10752 .k(k) 10753 .iterations(1) 10754 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10755 } 10756 } 10757 } 10758 } 10759 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_gt_4)10760 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_gt_4) { 10761 TEST_REQUIRES_ARM_NEON_BF16; 10762 for (uint32_t n = 5; n < 8; n++) { 10763 for (size_t k = 1; k <= 40; k += 9) { 10764 GemmMicrokernelTester() 10765 .mr(5) 10766 .nr(4) 10767 .kr(8) 10768 .sr(1) 10769 .m(5) 10770 .n(n) 10771 .k(k) 10772 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10773 } 10774 } 10775 } 10776 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_gt_4_strided_cn)10777 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_gt_4_strided_cn) { 10778 TEST_REQUIRES_ARM_NEON_BF16; 10779 for (uint32_t n = 5; n < 8; n++) { 10780 for (size_t k = 1; k <= 40; k += 9) { 10781 GemmMicrokernelTester() 10782 .mr(5) 10783 .nr(4) 10784 .kr(8) 10785 .sr(1) 10786 .m(5) 10787 .n(n) 10788 .k(k) 10789 .cn_stride(7) 10790 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10791 } 10792 } 10793 } 10794 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_gt_4_strided_a)10795 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_gt_4_strided_a) { 10796 TEST_REQUIRES_ARM_NEON_BF16; 10797 for (uint32_t n = 5; n < 8; n++) { 10798 for (size_t k = 1; k <= 40; k += 9) { 10799 GemmMicrokernelTester() 10800 .mr(5) 10801 .nr(4) 10802 .kr(8) 10803 .sr(1) 10804 .m(5) 10805 .n(n) 10806 .k(k) 10807 .a_stride(43) 10808 
.Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10809 } 10810 } 10811 } 10812 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_gt_4_subtile)10813 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_gt_4_subtile) { 10814 TEST_REQUIRES_ARM_NEON_BF16; 10815 for (uint32_t n = 5; n < 8; n++) { 10816 for (size_t k = 1; k <= 40; k += 9) { 10817 for (uint32_t m = 1; m <= 5; m++) { 10818 GemmMicrokernelTester() 10819 .mr(5) 10820 .nr(4) 10821 .kr(8) 10822 .sr(1) 10823 .m(m) 10824 .n(n) 10825 .k(k) 10826 .iterations(1) 10827 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10828 } 10829 } 10830 } 10831 } 10832 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_div_4)10833 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_div_4) { 10834 TEST_REQUIRES_ARM_NEON_BF16; 10835 for (uint32_t n = 8; n <= 12; n += 4) { 10836 for (size_t k = 1; k <= 40; k += 9) { 10837 GemmMicrokernelTester() 10838 .mr(5) 10839 .nr(4) 10840 .kr(8) 10841 .sr(1) 10842 .m(5) 10843 .n(n) 10844 .k(k) 10845 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10846 } 10847 } 10848 } 10849 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_div_4_strided_cn)10850 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_div_4_strided_cn) { 10851 TEST_REQUIRES_ARM_NEON_BF16; 10852 for (uint32_t n = 8; n <= 12; n += 4) { 10853 for (size_t k = 1; k <= 40; k += 9) { 10854 GemmMicrokernelTester() 10855 .mr(5) 10856 .nr(4) 10857 .kr(8) 10858 .sr(1) 10859 .m(5) 10860 .n(n) 10861 .k(k) 10862 .cn_stride(7) 10863 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10864 } 10865 } 10866 } 10867 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_div_4_strided_a)10868 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_div_4_strided_a) { 10869 TEST_REQUIRES_ARM_NEON_BF16; 10870 for (uint32_t n = 8; n <= 12; n += 4) { 10871 for (size_t k = 1; k <= 40; k += 9) { 10872 
GemmMicrokernelTester() 10873 .mr(5) 10874 .nr(4) 10875 .kr(8) 10876 .sr(1) 10877 .m(5) 10878 .n(n) 10879 .k(k) 10880 .a_stride(43) 10881 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10882 } 10883 } 10884 } 10885 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,n_div_4_subtile)10886 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, n_div_4_subtile) { 10887 TEST_REQUIRES_ARM_NEON_BF16; 10888 for (uint32_t n = 8; n <= 12; n += 4) { 10889 for (size_t k = 1; k <= 40; k += 9) { 10890 for (uint32_t m = 1; m <= 5; m++) { 10891 GemmMicrokernelTester() 10892 .mr(5) 10893 .nr(4) 10894 .kr(8) 10895 .sr(1) 10896 .m(m) 10897 .n(n) 10898 .k(k) 10899 .iterations(1) 10900 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10901 } 10902 } 10903 } 10904 } 10905 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,strided_cm_subtile)10906 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, strided_cm_subtile) { 10907 TEST_REQUIRES_ARM_NEON_BF16; 10908 for (size_t k = 1; k <= 40; k += 9) { 10909 for (uint32_t n = 1; n <= 4; n++) { 10910 for (uint32_t m = 1; m <= 5; m++) { 10911 GemmMicrokernelTester() 10912 .mr(5) 10913 .nr(4) 10914 .kr(8) 10915 .sr(1) 10916 .m(m) 10917 .n(n) 10918 .k(k) 10919 .cm_stride(7) 10920 .iterations(1) 10921 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10922 } 10923 } 10924 } 10925 } 10926 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,qmin)10927 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, qmin) { 10928 TEST_REQUIRES_ARM_NEON_BF16; 10929 GemmMicrokernelTester() 10930 .mr(5) 10931 .nr(4) 10932 .kr(8) 10933 .sr(1) 10934 .m(5) 10935 .n(4) 10936 .k(8) 10937 .qmin(128) 10938 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10939 } 10940 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,qmax)10941 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, qmax) { 10942 TEST_REQUIRES_ARM_NEON_BF16; 10943 
GemmMicrokernelTester() 10944 .mr(5) 10945 .nr(4) 10946 .kr(8) 10947 .sr(1) 10948 .m(5) 10949 .n(4) 10950 .k(8) 10951 .qmax(128) 10952 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10953 } 10954 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL,strided_cm)10955 TEST(BF16_GEMM_MINMAX_5X4C8__NEONBF16_BFMLAL, strided_cm) { 10956 TEST_REQUIRES_ARM_NEON_BF16; 10957 GemmMicrokernelTester() 10958 .mr(5) 10959 .nr(4) 10960 .kr(8) 10961 .sr(1) 10962 .m(5) 10963 .n(4) 10964 .k(8) 10965 .cm_stride(7) 10966 .Test(xnn_bf16_gemm_minmax_ukernel_5x4c8__neonbf16_bfmlal, xnn_init_bf16_minmax_scalar_params); 10967 } 10968 #endif // XNN_ENABLE_ARM_BF16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 10969