1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 // 9 // Auto-generated file. Do not edit! 10 // Specification: test/f16-igemm-minmax.yaml 11 // Generator: tools/generate-gemm-test.py 12 13 14 #include <gtest/gtest.h> 15 16 #include <xnnpack/allocator.h> 17 #include <xnnpack/common.h> 18 #include <xnnpack/isa-checks.h> 19 #include <xnnpack/microparams-init.h> 20 21 #include <xnnpack/gemm.h> 22 #include <xnnpack/igemm.h> 23 #include <xnnpack/ppmm.h> 24 #include "gemm-microkernel-tester.h" 25 26 27 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)28 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 29 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 30 GemmMicrokernelTester() 31 .mr(1) 32 .nr(16) 33 .kr(1) 34 .sr(1) 35 .m(1) 36 .n(16) 37 .k(2) 38 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 39 } 40 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)41 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 42 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 43 GemmMicrokernelTester() 44 .mr(1) 45 .nr(16) 46 .kr(1) 47 .sr(1) 48 .m(1) 49 .n(16) 50 .k(2) 51 .cn_stride(19) 52 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 53 } 54 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)55 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 56 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 57 for (uint32_t n = 1; n <= 16; n++) { 58 for (uint32_t m = 1; m <= 1; m++) { 59 GemmMicrokernelTester() 60 .mr(1) 61 .nr(16) 62 .kr(1) 63 .sr(1) 64 .m(m) 65 .n(n) 66 .k(2) 67 .iterations(1) 68 
.Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 69 } 70 } 71 } 72 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)73 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 74 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 75 for (uint32_t m = 1; m <= 1; m++) { 76 GemmMicrokernelTester() 77 .mr(1) 78 .nr(16) 79 .kr(1) 80 .sr(1) 81 .m(m) 82 .n(16) 83 .k(2) 84 .iterations(1) 85 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 86 } 87 } 88 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)89 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 90 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 91 for (uint32_t n = 1; n <= 16; n++) { 92 GemmMicrokernelTester() 93 .mr(1) 94 .nr(16) 95 .kr(1) 96 .sr(1) 97 .m(1) 98 .n(n) 99 .k(2) 100 .iterations(1) 101 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 102 } 103 } 104 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)105 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 106 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 107 for (size_t k = 1; k < 2; k++) { 108 GemmMicrokernelTester() 109 .mr(1) 110 .nr(16) 111 .kr(1) 112 .sr(1) 113 .m(1) 114 .n(16) 115 .k(k) 116 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 117 } 118 } 119 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)120 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 121 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 122 for (size_t k = 1; k < 2; k++) { 123 for (uint32_t n = 1; n <= 16; n++) { 124 for (uint32_t m = 1; m <= 1; m++) { 125 GemmMicrokernelTester() 126 .mr(1) 127 .nr(16) 128 .kr(1) 129 .sr(1) 130 .m(m) 131 .n(n) 132 .k(k) 133 .iterations(1) 134 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, 
xnn_init_f16_minmax_neon_params); 135 } 136 } 137 } 138 } 139 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)140 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 141 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 142 for (size_t k = 3; k < 4; k++) { 143 GemmMicrokernelTester() 144 .mr(1) 145 .nr(16) 146 .kr(1) 147 .sr(1) 148 .m(1) 149 .n(16) 150 .k(k) 151 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 152 } 153 } 154 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_subtile)155 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) { 156 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 157 for (size_t k = 3; k < 4; k++) { 158 for (uint32_t n = 1; n <= 16; n++) { 159 for (uint32_t m = 1; m <= 1; m++) { 160 GemmMicrokernelTester() 161 .mr(1) 162 .nr(16) 163 .kr(1) 164 .sr(1) 165 .m(m) 166 .n(n) 167 .k(k) 168 .iterations(1) 169 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 170 } 171 } 172 } 173 } 174 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_div_2)175 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) { 176 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 177 for (size_t k = 4; k <= 20; k += 2) { 178 GemmMicrokernelTester() 179 .mr(1) 180 .nr(16) 181 .kr(1) 182 .sr(1) 183 .m(1) 184 .n(16) 185 .k(k) 186 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 187 } 188 } 189 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)190 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 191 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 192 for (size_t k = 4; k <= 20; k += 2) { 193 for (uint32_t n = 1; n <= 16; n++) { 194 for (uint32_t m = 1; m <= 1; m++) { 195 GemmMicrokernelTester() 196 .mr(1) 197 .nr(16) 198 .kr(1) 199 .sr(1) 200 .m(m) 201 .n(n) 202 .k(k) 203 .iterations(1) 204 
.Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 205 } 206 } 207 } 208 } 209 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)210 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 211 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 212 for (uint32_t n = 17; n < 32; n++) { 213 for (size_t k = 1; k <= 10; k += 3) { 214 GemmMicrokernelTester() 215 .mr(1) 216 .nr(16) 217 .kr(1) 218 .sr(1) 219 .m(1) 220 .n(n) 221 .k(k) 222 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 223 } 224 } 225 } 226 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)227 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 228 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 229 for (uint32_t n = 17; n < 32; n++) { 230 for (size_t k = 1; k <= 10; k += 3) { 231 GemmMicrokernelTester() 232 .mr(1) 233 .nr(16) 234 .kr(1) 235 .sr(1) 236 .m(1) 237 .n(n) 238 .k(k) 239 .cn_stride(19) 240 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 241 } 242 } 243 } 244 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)245 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) { 246 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 247 for (uint32_t n = 17; n < 32; n++) { 248 for (size_t k = 1; k <= 10; k += 3) { 249 for (uint32_t m = 1; m <= 1; m++) { 250 GemmMicrokernelTester() 251 .mr(1) 252 .nr(16) 253 .kr(1) 254 .sr(1) 255 .m(m) 256 .n(n) 257 .k(k) 258 .iterations(1) 259 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 260 } 261 } 262 } 263 } 264 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)265 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) { 266 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 267 for (uint32_t n = 32; n <= 48; n += 16) { 268 for (size_t k = 1; k <= 10; k += 3) { 269 
GemmMicrokernelTester() 270 .mr(1) 271 .nr(16) 272 .kr(1) 273 .sr(1) 274 .m(1) 275 .n(n) 276 .k(k) 277 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 278 } 279 } 280 } 281 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_cn)282 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) { 283 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 284 for (uint32_t n = 32; n <= 48; n += 16) { 285 for (size_t k = 1; k <= 10; k += 3) { 286 GemmMicrokernelTester() 287 .mr(1) 288 .nr(16) 289 .kr(1) 290 .sr(1) 291 .m(1) 292 .n(n) 293 .k(k) 294 .cn_stride(19) 295 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 296 } 297 } 298 } 299 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_subtile)300 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 301 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 302 for (uint32_t n = 32; n <= 48; n += 16) { 303 for (size_t k = 1; k <= 10; k += 3) { 304 for (uint32_t m = 1; m <= 1; m++) { 305 GemmMicrokernelTester() 306 .mr(1) 307 .nr(16) 308 .kr(1) 309 .sr(1) 310 .m(m) 311 .n(n) 312 .k(k) 313 .iterations(1) 314 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 315 } 316 } 317 } 318 } 319 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,small_kernel)320 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, small_kernel) { 321 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 322 for (size_t k = 1; k <= 10; k += 3) { 323 GemmMicrokernelTester() 324 .mr(1) 325 .nr(16) 326 .kr(1) 327 .sr(1) 328 .m(1) 329 .n(16) 330 .k(k) 331 .ks(3) 332 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 333 } 334 } 335 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,small_kernel_subtile)336 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, small_kernel_subtile) { 337 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 338 for (size_t k = 1; k <= 10; k += 3) { 339 for (uint32_t n = 1; n <= 16; n++) { 340 for (uint32_t m = 1; m <= 1; m++) { 341 GemmMicrokernelTester() 342 .mr(1) 343 .nr(16) 344 .kr(1) 345 .sr(1) 346 .m(m) 347 .n(n) 348 .k(k) 349 .ks(3) 350 .iterations(1) 351 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 352 } 353 } 354 } 355 } 356 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_small_kernel)357 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_small_kernel) { 358 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 359 for (uint32_t n = 17; n < 32; n++) { 360 for (size_t k = 1; k <= 10; k += 3) { 361 GemmMicrokernelTester() 362 .mr(1) 363 .nr(16) 364 .kr(1) 365 .sr(1) 366 .m(1) 367 .n(n) 368 .k(k) 369 .ks(3) 370 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 371 } 372 } 373 } 374 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_small_kernel)375 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_small_kernel) { 376 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 377 for (uint32_t n = 32; n <= 48; n += 16) { 378 for (size_t k = 1; k <= 10; k += 3) { 379 GemmMicrokernelTester() 380 .mr(1) 381 .nr(16) 382 .kr(1) 383 .sr(1) 384 .m(1) 385 .n(n) 386 .k(k) 387 .ks(3) 388 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 389 } 390 } 391 } 392 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)393 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 394 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 395 for (size_t k = 1; k <= 10; k += 3) { 396 for (uint32_t n = 1; n <= 16; n++) { 397 for (uint32_t m = 1; m <= 1; m++) { 398 GemmMicrokernelTester() 399 .mr(1) 400 .nr(16) 401 .kr(1) 402 .sr(1) 403 .m(m) 404 .n(n) 405 .k(k) 406 .cm_stride(19) 407 .iterations(1) 408 
.Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 409 } 410 } 411 } 412 } 413 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,a_offset)414 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, a_offset) { 415 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 416 for (size_t k = 1; k <= 10; k += 3) { 417 GemmMicrokernelTester() 418 .mr(1) 419 .nr(16) 420 .kr(1) 421 .sr(1) 422 .m(1) 423 .n(16) 424 .k(k) 425 .ks(3) 426 .a_offset(13) 427 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 428 } 429 } 430 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,zero)431 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, zero) { 432 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 433 for (size_t k = 1; k <= 10; k += 3) { 434 for (uint32_t mz = 0; mz < 1; mz++) { 435 GemmMicrokernelTester() 436 .mr(1) 437 .nr(16) 438 .kr(1) 439 .sr(1) 440 .m(1) 441 .n(16) 442 .k(k) 443 .ks(3) 444 .a_offset(13) 445 .zero_index(mz) 446 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 447 } 448 } 449 } 450 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,qmin)451 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 452 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 453 GemmMicrokernelTester() 454 .mr(1) 455 .nr(16) 456 .kr(1) 457 .sr(1) 458 .m(1) 459 .n(16) 460 .k(2) 461 .qmin(128) 462 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 463 } 464 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,qmax)465 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 466 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 467 GemmMicrokernelTester() 468 .mr(1) 469 .nr(16) 470 .kr(1) 471 .sr(1) 472 .m(1) 473 .n(16) 474 .k(2) 475 .qmax(128) 476 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 477 } 478 
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)479 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 480 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 481 GemmMicrokernelTester() 482 .mr(1) 483 .nr(16) 484 .kr(1) 485 .sr(1) 486 .m(1) 487 .n(16) 488 .k(2) 489 .cm_stride(19) 490 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 491 } 492 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 493 494 495 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4)496 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 497 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 498 GemmMicrokernelTester() 499 .mr(1) 500 .nr(16) 501 .kr(1) 502 .sr(1) 503 .m(1) 504 .n(16) 505 .k(4) 506 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 507 } 508 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,strided_cn)509 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 510 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 511 GemmMicrokernelTester() 512 .mr(1) 513 .nr(16) 514 .kr(1) 515 .sr(1) 516 .m(1) 517 .n(16) 518 .k(4) 519 .cn_stride(19) 520 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 521 } 522 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)523 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 524 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 525 for (uint32_t n = 1; n <= 16; n++) { 526 for (uint32_t m = 1; m <= 1; m++) { 527 GemmMicrokernelTester() 528 .mr(1) 529 .nr(16) 530 .kr(1) 531 .sr(1) 532 .m(m) 533 .n(n) 534 .k(4) 535 .iterations(1) 536 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 537 } 538 } 539 } 540 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)541 
// 1x16 LD64 microkernel tests; every tester here uses mr=1, nr=16, kr=1, sr=1.

// k=4, n=16; sweeps m in [1, 1], iterations(1) per value.
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m = 1; m <= 1; m++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(m).n(16).k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k=4, m=1; sweeps n in [1, 16], iterations(1) per value.
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(n).k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// m=1, n=16; sweeps k in [1, 4).
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// Sweeps k in [1, 4), n in [1, 16], m in [1, 1]; iterations(1) per combination.
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n=16; sweeps k in [5, 8).
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// Sweeps k in [5, 8), n in [1, 16], m in [1, 1]; iterations(1) per combination.
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m=1, n=16; sweeps k = 8, 12, ..., 40.
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1).nr(16).kr(1).sr(1)
      .m(1).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// Sweeps k = 8, 12, ..., 40, n in [1, 16], m in [1, 1]; iterations(1) per combination.
TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16) { 679 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 680 for (uint32_t n = 17; n < 32; n++) { 681 for (size_t k = 1; k <= 20; k += 5) { 682 GemmMicrokernelTester() 683 .mr(1) 684 .nr(16) 685 .kr(1) 686 .sr(1) 687 .m(1) 688 .n(n) 689 .k(k) 690 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 691 } 692 } 693 } 694 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_cn)695 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 696 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 697 for (uint32_t n = 17; n < 32; n++) { 698 for (size_t k = 1; k <= 20; k += 5) { 699 GemmMicrokernelTester() 700 .mr(1) 701 .nr(16) 702 .kr(1) 703 .sr(1) 704 .m(1) 705 .n(n) 706 .k(k) 707 .cn_stride(19) 708 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 709 } 710 } 711 } 712 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_subtile)713 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_subtile) { 714 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 715 for (uint32_t n = 17; n < 32; n++) { 716 for (size_t k = 1; k <= 20; k += 5) { 717 for (uint32_t m = 1; m <= 1; m++) { 718 GemmMicrokernelTester() 719 .mr(1) 720 .nr(16) 721 .kr(1) 722 .sr(1) 723 .m(m) 724 .n(n) 725 .k(k) 726 .iterations(1) 727 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 728 } 729 } 730 } 731 } 732 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16)733 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16) { 734 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 735 for (uint32_t n = 32; n <= 48; n += 16) { 736 for (size_t k = 1; k <= 20; k += 5) { 737 GemmMicrokernelTester() 738 .mr(1) 739 .nr(16) 740 .kr(1) 741 .sr(1) 742 .m(1) 743 .n(n) 744 .k(k) 745 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 746 } 
747 } 748 } 749 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_cn)750 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_cn) { 751 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 752 for (uint32_t n = 32; n <= 48; n += 16) { 753 for (size_t k = 1; k <= 20; k += 5) { 754 GemmMicrokernelTester() 755 .mr(1) 756 .nr(16) 757 .kr(1) 758 .sr(1) 759 .m(1) 760 .n(n) 761 .k(k) 762 .cn_stride(19) 763 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 764 } 765 } 766 } 767 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_subtile)768 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_subtile) { 769 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 770 for (uint32_t n = 32; n <= 48; n += 16) { 771 for (size_t k = 1; k <= 20; k += 5) { 772 for (uint32_t m = 1; m <= 1; m++) { 773 GemmMicrokernelTester() 774 .mr(1) 775 .nr(16) 776 .kr(1) 777 .sr(1) 778 .m(m) 779 .n(n) 780 .k(k) 781 .iterations(1) 782 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 783 } 784 } 785 } 786 } 787 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,small_kernel)788 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, small_kernel) { 789 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 790 for (size_t k = 1; k <= 20; k += 5) { 791 GemmMicrokernelTester() 792 .mr(1) 793 .nr(16) 794 .kr(1) 795 .sr(1) 796 .m(1) 797 .n(16) 798 .k(k) 799 .ks(3) 800 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 801 } 802 } 803 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,small_kernel_subtile)804 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, small_kernel_subtile) { 805 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 806 for (size_t k = 1; k <= 20; k += 5) { 807 for (uint32_t n = 1; n <= 16; n++) { 808 for (uint32_t m = 1; m <= 1; m++) { 809 GemmMicrokernelTester() 810 .mr(1) 811 .nr(16) 812 .kr(1) 813 .sr(1) 814 .m(m) 815 
.n(n) 816 .k(k) 817 .ks(3) 818 .iterations(1) 819 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 820 } 821 } 822 } 823 } 824 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_small_kernel)825 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_small_kernel) { 826 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 827 for (uint32_t n = 17; n < 32; n++) { 828 for (size_t k = 1; k <= 20; k += 5) { 829 GemmMicrokernelTester() 830 .mr(1) 831 .nr(16) 832 .kr(1) 833 .sr(1) 834 .m(1) 835 .n(n) 836 .k(k) 837 .ks(3) 838 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 839 } 840 } 841 } 842 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_small_kernel)843 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_small_kernel) { 844 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 845 for (uint32_t n = 32; n <= 48; n += 16) { 846 for (size_t k = 1; k <= 20; k += 5) { 847 GemmMicrokernelTester() 848 .mr(1) 849 .nr(16) 850 .kr(1) 851 .sr(1) 852 .m(1) 853 .n(n) 854 .k(k) 855 .ks(3) 856 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 857 } 858 } 859 } 860 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)861 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 862 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 863 for (size_t k = 1; k <= 20; k += 5) { 864 for (uint32_t n = 1; n <= 16; n++) { 865 for (uint32_t m = 1; m <= 1; m++) { 866 GemmMicrokernelTester() 867 .mr(1) 868 .nr(16) 869 .kr(1) 870 .sr(1) 871 .m(m) 872 .n(n) 873 .k(k) 874 .cm_stride(19) 875 .iterations(1) 876 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 877 } 878 } 879 } 880 } 881 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,a_offset)882 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, a_offset) { 883 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 884 for (size_t k = 1; k <= 20; k += 5) { 885 GemmMicrokernelTester() 886 .mr(1) 887 .nr(16) 888 .kr(1) 889 .sr(1) 890 .m(1) 891 .n(16) 892 .k(k) 893 .ks(3) 894 .a_offset(23) 895 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 896 } 897 } 898 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,zero)899 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, zero) { 900 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 901 for (size_t k = 1; k <= 20; k += 5) { 902 for (uint32_t mz = 0; mz < 1; mz++) { 903 GemmMicrokernelTester() 904 .mr(1) 905 .nr(16) 906 .kr(1) 907 .sr(1) 908 .m(1) 909 .n(16) 910 .k(k) 911 .ks(3) 912 .a_offset(23) 913 .zero_index(mz) 914 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 915 } 916 } 917 } 918 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,qmin)919 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, qmin) { 920 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 921 GemmMicrokernelTester() 922 .mr(1) 923 .nr(16) 924 .kr(1) 925 .sr(1) 926 .m(1) 927 .n(16) 928 .k(4) 929 .qmin(128) 930 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 931 } 932 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,qmax)933 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, qmax) { 934 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 935 GemmMicrokernelTester() 936 .mr(1) 937 .nr(16) 938 .kr(1) 939 .sr(1) 940 .m(1) 941 .n(16) 942 .k(4) 943 .qmax(128) 944 .Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 945 } 946 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64,strided_cm)947 TEST(F16_IGEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 948 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 949 GemmMicrokernelTester() 950 .mr(1) 951 .nr(16) 952 .kr(1) 953 .sr(1) 954 .m(1) 955 .n(16) 956 .k(4) 957 .cm_stride(19) 958 
.Test(xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 959 } 960 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 961 962 963 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)964 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 965 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 966 GemmMicrokernelTester() 967 .mr(4) 968 .nr(16) 969 .kr(1) 970 .sr(1) 971 .m(4) 972 .n(16) 973 .k(2) 974 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 975 } 976 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)977 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 978 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 979 GemmMicrokernelTester() 980 .mr(4) 981 .nr(16) 982 .kr(1) 983 .sr(1) 984 .m(4) 985 .n(16) 986 .k(2) 987 .cn_stride(19) 988 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 989 } 990 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)991 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 992 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 993 for (uint32_t n = 1; n <= 16; n++) { 994 for (uint32_t m = 1; m <= 4; m++) { 995 GemmMicrokernelTester() 996 .mr(4) 997 .nr(16) 998 .kr(1) 999 .sr(1) 1000 .m(m) 1001 .n(n) 1002 .k(2) 1003 .iterations(1) 1004 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1005 } 1006 } 1007 } 1008 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)1009 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 1010 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1011 for (uint32_t m = 1; m <= 4; m++) { 1012 GemmMicrokernelTester() 1013 .mr(4) 1014 .nr(16) 1015 .kr(1) 1016 .sr(1) 1017 .m(m) 1018 .n(16) 1019 .k(2) 1020 .iterations(1) 1021 
.Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1022 } 1023 } 1024 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)1025 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 1026 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1027 for (uint32_t n = 1; n <= 16; n++) { 1028 GemmMicrokernelTester() 1029 .mr(4) 1030 .nr(16) 1031 .kr(1) 1032 .sr(1) 1033 .m(4) 1034 .n(n) 1035 .k(2) 1036 .iterations(1) 1037 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1038 } 1039 } 1040 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)1041 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 1042 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1043 for (size_t k = 1; k < 2; k++) { 1044 GemmMicrokernelTester() 1045 .mr(4) 1046 .nr(16) 1047 .kr(1) 1048 .sr(1) 1049 .m(4) 1050 .n(16) 1051 .k(k) 1052 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1053 } 1054 } 1055 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)1056 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 1057 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1058 for (size_t k = 1; k < 2; k++) { 1059 for (uint32_t n = 1; n <= 16; n++) { 1060 for (uint32_t m = 1; m <= 4; m++) { 1061 GemmMicrokernelTester() 1062 .mr(4) 1063 .nr(16) 1064 .kr(1) 1065 .sr(1) 1066 .m(m) 1067 .n(n) 1068 .k(k) 1069 .iterations(1) 1070 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1071 } 1072 } 1073 } 1074 } 1075 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)1076 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 1077 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1078 for (size_t k = 3; k < 4; k++) { 1079 GemmMicrokernelTester() 1080 .mr(4) 1081 .nr(16) 1082 .kr(1) 1083 .sr(1) 1084 .m(4) 1085 .n(16) 1086 .k(k) 1087 
.Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1088 } 1089 } 1090 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_subtile)1091 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) { 1092 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1093 for (size_t k = 3; k < 4; k++) { 1094 for (uint32_t n = 1; n <= 16; n++) { 1095 for (uint32_t m = 1; m <= 4; m++) { 1096 GemmMicrokernelTester() 1097 .mr(4) 1098 .nr(16) 1099 .kr(1) 1100 .sr(1) 1101 .m(m) 1102 .n(n) 1103 .k(k) 1104 .iterations(1) 1105 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1106 } 1107 } 1108 } 1109 } 1110 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_div_2)1111 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) { 1112 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1113 for (size_t k = 4; k <= 20; k += 2) { 1114 GemmMicrokernelTester() 1115 .mr(4) 1116 .nr(16) 1117 .kr(1) 1118 .sr(1) 1119 .m(4) 1120 .n(16) 1121 .k(k) 1122 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1123 } 1124 } 1125 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)1126 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 1127 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1128 for (size_t k = 4; k <= 20; k += 2) { 1129 for (uint32_t n = 1; n <= 16; n++) { 1130 for (uint32_t m = 1; m <= 4; m++) { 1131 GemmMicrokernelTester() 1132 .mr(4) 1133 .nr(16) 1134 .kr(1) 1135 .sr(1) 1136 .m(m) 1137 .n(n) 1138 .k(k) 1139 .iterations(1) 1140 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1141 } 1142 } 1143 } 1144 } 1145 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)1146 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 1147 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1148 for (uint32_t n = 17; n < 32; n++) { 1149 for (size_t k = 1; k <= 10; 
k += 3) { 1150 GemmMicrokernelTester() 1151 .mr(4) 1152 .nr(16) 1153 .kr(1) 1154 .sr(1) 1155 .m(4) 1156 .n(n) 1157 .k(k) 1158 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1159 } 1160 } 1161 } 1162 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)1163 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 1164 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1165 for (uint32_t n = 17; n < 32; n++) { 1166 for (size_t k = 1; k <= 10; k += 3) { 1167 GemmMicrokernelTester() 1168 .mr(4) 1169 .nr(16) 1170 .kr(1) 1171 .sr(1) 1172 .m(4) 1173 .n(n) 1174 .k(k) 1175 .cn_stride(19) 1176 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1177 } 1178 } 1179 } 1180 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)1181 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) { 1182 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1183 for (uint32_t n = 17; n < 32; n++) { 1184 for (size_t k = 1; k <= 10; k += 3) { 1185 for (uint32_t m = 1; m <= 4; m++) { 1186 GemmMicrokernelTester() 1187 .mr(4) 1188 .nr(16) 1189 .kr(1) 1190 .sr(1) 1191 .m(m) 1192 .n(n) 1193 .k(k) 1194 .iterations(1) 1195 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1196 } 1197 } 1198 } 1199 } 1200 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)1201 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) { 1202 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1203 for (uint32_t n = 32; n <= 48; n += 16) { 1204 for (size_t k = 1; k <= 10; k += 3) { 1205 GemmMicrokernelTester() 1206 .mr(4) 1207 .nr(16) 1208 .kr(1) 1209 .sr(1) 1210 .m(4) 1211 .n(n) 1212 .k(k) 1213 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1214 } 1215 } 1216 } 1217 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_strided_cn)1218 
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) { 1219 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1220 for (uint32_t n = 32; n <= 48; n += 16) { 1221 for (size_t k = 1; k <= 10; k += 3) { 1222 GemmMicrokernelTester() 1223 .mr(4) 1224 .nr(16) 1225 .kr(1) 1226 .sr(1) 1227 .m(4) 1228 .n(n) 1229 .k(k) 1230 .cn_stride(19) 1231 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1232 } 1233 } 1234 } 1235 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_subtile)1236 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) { 1237 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1238 for (uint32_t n = 32; n <= 48; n += 16) { 1239 for (size_t k = 1; k <= 10; k += 3) { 1240 for (uint32_t m = 1; m <= 4; m++) { 1241 GemmMicrokernelTester() 1242 .mr(4) 1243 .nr(16) 1244 .kr(1) 1245 .sr(1) 1246 .m(m) 1247 .n(n) 1248 .k(k) 1249 .iterations(1) 1250 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1251 } 1252 } 1253 } 1254 } 1255 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,small_kernel)1256 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, small_kernel) { 1257 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1258 for (size_t k = 1; k <= 10; k += 3) { 1259 GemmMicrokernelTester() 1260 .mr(4) 1261 .nr(16) 1262 .kr(1) 1263 .sr(1) 1264 .m(4) 1265 .n(16) 1266 .k(k) 1267 .ks(3) 1268 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1269 } 1270 } 1271 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,small_kernel_subtile)1272 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, small_kernel_subtile) { 1273 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1274 for (size_t k = 1; k <= 10; k += 3) { 1275 for (uint32_t n = 1; n <= 16; n++) { 1276 for (uint32_t m = 1; m <= 4; m++) { 1277 GemmMicrokernelTester() 1278 .mr(4) 1279 .nr(16) 1280 .kr(1) 1281 .sr(1) 1282 .m(m) 1283 .n(n) 1284 .k(k) 1285 .ks(3) 
1286 .iterations(1) 1287 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1288 } 1289 } 1290 } 1291 } 1292 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_small_kernel)1293 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_small_kernel) { 1294 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1295 for (uint32_t n = 17; n < 32; n++) { 1296 for (size_t k = 1; k <= 10; k += 3) { 1297 GemmMicrokernelTester() 1298 .mr(4) 1299 .nr(16) 1300 .kr(1) 1301 .sr(1) 1302 .m(4) 1303 .n(n) 1304 .k(k) 1305 .ks(3) 1306 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1307 } 1308 } 1309 } 1310 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,n_div_16_small_kernel)1311 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_small_kernel) { 1312 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1313 for (uint32_t n = 32; n <= 48; n += 16) { 1314 for (size_t k = 1; k <= 10; k += 3) { 1315 GemmMicrokernelTester() 1316 .mr(4) 1317 .nr(16) 1318 .kr(1) 1319 .sr(1) 1320 .m(4) 1321 .n(n) 1322 .k(k) 1323 .ks(3) 1324 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1325 } 1326 } 1327 } 1328 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cm_subtile)1329 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) { 1330 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1331 for (size_t k = 1; k <= 10; k += 3) { 1332 for (uint32_t n = 1; n <= 16; n++) { 1333 for (uint32_t m = 1; m <= 4; m++) { 1334 GemmMicrokernelTester() 1335 .mr(4) 1336 .nr(16) 1337 .kr(1) 1338 .sr(1) 1339 .m(m) 1340 .n(n) 1341 .k(k) 1342 .cm_stride(19) 1343 .iterations(1) 1344 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1345 } 1346 } 1347 } 1348 } 1349 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,a_offset)1350 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, 
a_offset) { 1351 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1352 for (size_t k = 1; k <= 10; k += 3) { 1353 GemmMicrokernelTester() 1354 .mr(4) 1355 .nr(16) 1356 .kr(1) 1357 .sr(1) 1358 .m(4) 1359 .n(16) 1360 .k(k) 1361 .ks(3) 1362 .a_offset(43) 1363 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1364 } 1365 } 1366 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,zero)1367 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, zero) { 1368 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1369 for (size_t k = 1; k <= 10; k += 3) { 1370 for (uint32_t mz = 0; mz < 4; mz++) { 1371 GemmMicrokernelTester() 1372 .mr(4) 1373 .nr(16) 1374 .kr(1) 1375 .sr(1) 1376 .m(4) 1377 .n(16) 1378 .k(k) 1379 .ks(3) 1380 .a_offset(43) 1381 .zero_index(mz) 1382 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1383 } 1384 } 1385 } 1386 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,qmin)1387 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmin) { 1388 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1389 GemmMicrokernelTester() 1390 .mr(4) 1391 .nr(16) 1392 .kr(1) 1393 .sr(1) 1394 .m(4) 1395 .n(16) 1396 .k(2) 1397 .qmin(128) 1398 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1399 } 1400 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,qmax)1401 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 1402 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1403 GemmMicrokernelTester() 1404 .mr(4) 1405 .nr(16) 1406 .kr(1) 1407 .sr(1) 1408 .m(4) 1409 .n(16) 1410 .k(2) 1411 .qmax(128) 1412 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1413 } 1414 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)1415 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 1416 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1417 GemmMicrokernelTester() 1418 .mr(4) 1419 .nr(16) 1420 .kr(1) 
1421 .sr(1) 1422 .m(4) 1423 .n(16) 1424 .k(2) 1425 .cm_stride(19) 1426 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 1427 } 1428 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 1429 1430 1431 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4)1432 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 1433 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1434 GemmMicrokernelTester() 1435 .mr(4) 1436 .nr(16) 1437 .kr(1) 1438 .sr(1) 1439 .m(4) 1440 .n(16) 1441 .k(4) 1442 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1443 } 1444 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,strided_cn)1445 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 1446 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1447 GemmMicrokernelTester() 1448 .mr(4) 1449 .nr(16) 1450 .kr(1) 1451 .sr(1) 1452 .m(4) 1453 .n(16) 1454 .k(4) 1455 .cn_stride(19) 1456 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1457 } 1458 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)1459 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 1460 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1461 for (uint32_t n = 1; n <= 16; n++) { 1462 for (uint32_t m = 1; m <= 4; m++) { 1463 GemmMicrokernelTester() 1464 .mr(4) 1465 .nr(16) 1466 .kr(1) 1467 .sr(1) 1468 .m(m) 1469 .n(n) 1470 .k(4) 1471 .iterations(1) 1472 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1473 } 1474 } 1475 } 1476 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)1477 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 1478 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1479 for (uint32_t m = 1; m <= 4; m++) { 1480 GemmMicrokernelTester() 1481 .mr(4) 1482 .nr(16) 
1483 .kr(1) 1484 .sr(1) 1485 .m(m) 1486 .n(16) 1487 .k(4) 1488 .iterations(1) 1489 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1490 } 1491 } 1492 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)1493 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 1494 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1495 for (uint32_t n = 1; n <= 16; n++) { 1496 GemmMicrokernelTester() 1497 .mr(4) 1498 .nr(16) 1499 .kr(1) 1500 .sr(1) 1501 .m(4) 1502 .n(n) 1503 .k(4) 1504 .iterations(1) 1505 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1506 } 1507 } 1508 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4)1509 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 1510 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1511 for (size_t k = 1; k < 4; k++) { 1512 GemmMicrokernelTester() 1513 .mr(4) 1514 .nr(16) 1515 .kr(1) 1516 .sr(1) 1517 .m(4) 1518 .n(16) 1519 .k(k) 1520 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1521 } 1522 } 1523 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)1524 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 1525 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1526 for (size_t k = 1; k < 4; k++) { 1527 for (uint32_t n = 1; n <= 16; n++) { 1528 for (uint32_t m = 1; m <= 4; m++) { 1529 GemmMicrokernelTester() 1530 .mr(4) 1531 .nr(16) 1532 .kr(1) 1533 .sr(1) 1534 .m(m) 1535 .n(n) 1536 .k(k) 1537 .iterations(1) 1538 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1539 } 1540 } 1541 } 1542 } 1543 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4)1544 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 1545 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1546 for (size_t k = 5; k < 8; k++) { 1547 GemmMicrokernelTester() 1548 .mr(4) 1549 .nr(16) 1550 
.kr(1) 1551 .sr(1) 1552 .m(4) 1553 .n(16) 1554 .k(k) 1555 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1556 } 1557 } 1558 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)1559 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 1560 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1561 for (size_t k = 5; k < 8; k++) { 1562 for (uint32_t n = 1; n <= 16; n++) { 1563 for (uint32_t m = 1; m <= 4; m++) { 1564 GemmMicrokernelTester() 1565 .mr(4) 1566 .nr(16) 1567 .kr(1) 1568 .sr(1) 1569 .m(m) 1570 .n(n) 1571 .k(k) 1572 .iterations(1) 1573 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1574 } 1575 } 1576 } 1577 } 1578 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_div_4)1579 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 1580 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1581 for (size_t k = 8; k <= 40; k += 4) { 1582 GemmMicrokernelTester() 1583 .mr(4) 1584 .nr(16) 1585 .kr(1) 1586 .sr(1) 1587 .m(4) 1588 .n(16) 1589 .k(k) 1590 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1591 } 1592 } 1593 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)1594 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 1595 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1596 for (size_t k = 8; k <= 40; k += 4) { 1597 for (uint32_t n = 1; n <= 16; n++) { 1598 for (uint32_t m = 1; m <= 4; m++) { 1599 GemmMicrokernelTester() 1600 .mr(4) 1601 .nr(16) 1602 .kr(1) 1603 .sr(1) 1604 .m(m) 1605 .n(n) 1606 .k(k) 1607 .iterations(1) 1608 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1609 } 1610 } 1611 } 1612 } 1613 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16)1614 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16) { 1615 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1616 for 
(uint32_t n = 17; n < 32; n++) { 1617 for (size_t k = 1; k <= 20; k += 5) { 1618 GemmMicrokernelTester() 1619 .mr(4) 1620 .nr(16) 1621 .kr(1) 1622 .sr(1) 1623 .m(4) 1624 .n(n) 1625 .k(k) 1626 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1627 } 1628 } 1629 } 1630 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_cn)1631 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 1632 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1633 for (uint32_t n = 17; n < 32; n++) { 1634 for (size_t k = 1; k <= 20; k += 5) { 1635 GemmMicrokernelTester() 1636 .mr(4) 1637 .nr(16) 1638 .kr(1) 1639 .sr(1) 1640 .m(4) 1641 .n(n) 1642 .k(k) 1643 .cn_stride(19) 1644 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1645 } 1646 } 1647 } 1648 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_subtile)1649 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_subtile) { 1650 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1651 for (uint32_t n = 17; n < 32; n++) { 1652 for (size_t k = 1; k <= 20; k += 5) { 1653 for (uint32_t m = 1; m <= 4; m++) { 1654 GemmMicrokernelTester() 1655 .mr(4) 1656 .nr(16) 1657 .kr(1) 1658 .sr(1) 1659 .m(m) 1660 .n(n) 1661 .k(k) 1662 .iterations(1) 1663 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1664 } 1665 } 1666 } 1667 } 1668 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64,n_div_16)1669 TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16) { 1670 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1671 for (uint32_t n = 32; n <= 48; n += 16) { 1672 for (size_t k = 1; k <= 20; k += 5) { 1673 GemmMicrokernelTester() 1674 .mr(4) 1675 .nr(16) 1676 .kr(1) 1677 .sr(1) 1678 .m(4) 1679 .n(n) 1680 .k(k) 1681 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1682 } 1683 } 1684 } 1685 
// n = 32 and 48 with m = 4, k = 1, 6, 11, 16, and cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 32 and 48, k = 1, 6, 11, 16, and every m in [1, 4], one iteration each.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m = 4, n = 16, k = 1, 6, 11, 16, with ks set to 3.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .ks(3)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// ks = 3 for every sub-tile with n in [1, 16] and m in [1, 4], k = 1, 6, 11, 16, one iteration each.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, small_kernel_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .ks(3)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// ks = 3 with n in [17, 31], m = 4, k = 1, 6, 11, 16.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// ks = 3 with n = 32 and 48, m = 4, k = 1, 6, 11, 16.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// cm_stride = 19 for every sub-tile with n in [1, 16] and m in [1, 4], k = 1, 6, 11, 16, one iteration each.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}
// m = 4, n = 16, k = 1, 6, 11, 16, with ks = 3 and a_offset = 83.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, a_offset) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .ks(3)
      .a_offset(83)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// Same configuration as a_offset, additionally sweeping zero_index over mz in [0, 3].
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, zero) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    for (uint32_t mz = 0; mz < 4; mz++) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .ks(3)
        .a_offset(83)
        .zero_index(mz)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// m = 4, n = 16, k = 4 with qmin set to 128.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(4)
    .qmin(128)
    .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}

// m = 4, n = 16, k = 4 with qmax set to 128.
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(4)
    .qmax(128)
    .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
}
TEST(F16_IGEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 1884 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1885 GemmMicrokernelTester() 1886 .mr(4) 1887 .nr(16) 1888 .kr(1) 1889 .sr(1) 1890 .m(4) 1891 .n(16) 1892 .k(4) 1893 .cm_stride(19) 1894 .Test(xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 1895 } 1896 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 1897 1898 1899 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_2)1900 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_2) { 1901 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1902 GemmMicrokernelTester() 1903 .mr(6) 1904 .nr(16) 1905 .kr(1) 1906 .sr(1) 1907 .m(6) 1908 .n(16) 1909 .k(2) 1910 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 1911 } 1912 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,strided_cn)1913 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, strided_cn) { 1914 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1915 GemmMicrokernelTester() 1916 .mr(6) 1917 .nr(16) 1918 .kr(1) 1919 .sr(1) 1920 .m(6) 1921 .n(16) 1922 .k(2) 1923 .cn_stride(19) 1924 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 1925 } 1926 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,k_eq_2_subtile)1927 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_2_subtile) { 1928 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 1929 for (uint32_t n = 1; n <= 16; n++) { 1930 for (uint32_t m = 1; m <= 6; m++) { 1931 GemmMicrokernelTester() 1932 .mr(6) 1933 .nr(16) 1934 .kr(1) 1935 .sr(1) 1936 .m(m) 1937 .n(n) 1938 .k(2) 1939 .iterations(1) 1940 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 1941 } 1942 } 1943 } 1944 
// k = 2, n = 16: every m in [1, 6], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_2_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m = 1; m <= 6; m++) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(m).n(16).k(2)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
  }
}

// k = 2, m = 6: every n in [1, 16], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_eq_2_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(n).k(2)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
  }
}

// m = 6, n = 16 for k = 1 only (the loop covers 1 <= k < 2).
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_lt_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 2; k++) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1 only: every sub-tile with n in [1, 16] and m in [1, 6], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_lt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 2; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m = 6, n = 16 for k = 3 only (the loop covers 3 <= k < 4).
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_gt_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
  }
}

// k = 3 only: every sub-tile with n in [1, 16] and m in [1, 6], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_gt_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 3; k < 4; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m = 6, n = 16 for k = 4, 6, ..., 20.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_div_2) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 4; k <= 20; k += 2) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
  }
}

// k = 4, 6, ..., 20: every sub-tile with n in [1, 16] and m in [1, 6], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, k_div_2_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 4; k <= 20; k += 2) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n in [17, 31] with m = 6, for k = 1, 4, 7, 10.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n).k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_16, with cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n in [17, 31], k = 1, 4, 7, 10, and every m in [1, 6], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}
// n = 32 and 48 with m = 6, for k = 1, 4, 7, 10.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n).k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_16, with cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 32 and 48, k = 1, 4, 7, 10, and every m in [1, 6], one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// m = 6, n = 16, k = 1, 4, 7, 10, with ks set to 3.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    GemmMicrokernelTester()
      .mr(6).nr(16).kr(1).sr(1)
      .m(6).n(16).k(k)
      .ks(3)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
  }
}

// ks = 3 for every sub-tile with n in [1, 16] and m in [1, 6], k = 1, 4, 7, 10, one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, small_kernel_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .ks(3)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// ks = 3 with n in [17, 31], m = 6, k = 1, 4, 7, 10.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_gt_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n).k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}

// ks = 3 with n = 32 and 48, m = 6, k = 1, 4, 7, 10.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, n_div_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(n).k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params);
    }
  }
}
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,strided_cm_subtile)2265 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, strided_cm_subtile) { 2266 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2267 for (size_t k = 1; k <= 10; k += 3) { 2268 for (uint32_t n = 1; n <= 16; n++) { 2269 for (uint32_t m = 1; m <= 6; m++) { 2270 GemmMicrokernelTester() 2271 .mr(6) 2272 .nr(16) 2273 .kr(1) 2274 .sr(1) 2275 .m(m) 2276 .n(n) 2277 .k(k) 2278 .cm_stride(19) 2279 .iterations(1) 2280 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 2281 } 2282 } 2283 } 2284 } 2285 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,a_offset)2286 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, a_offset) { 2287 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2288 for (size_t k = 1; k <= 10; k += 3) { 2289 GemmMicrokernelTester() 2290 .mr(6) 2291 .nr(16) 2292 .kr(1) 2293 .sr(1) 2294 .m(6) 2295 .n(16) 2296 .k(k) 2297 .ks(3) 2298 .a_offset(67) 2299 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 2300 } 2301 } 2302 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,zero)2303 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, zero) { 2304 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2305 for (size_t k = 1; k <= 10; k += 3) { 2306 for (uint32_t mz = 0; mz < 6; mz++) { 2307 GemmMicrokernelTester() 2308 .mr(6) 2309 .nr(16) 2310 .kr(1) 2311 .sr(1) 2312 .m(6) 2313 .n(16) 2314 .k(k) 2315 .ks(3) 2316 .a_offset(67) 2317 .zero_index(mz) 2318 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 2319 } 2320 } 2321 } 2322 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,qmin)2323 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, qmin) { 2324 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2325 GemmMicrokernelTester() 2326 .mr(6) 2327 .nr(16) 2328 .kr(1) 2329 .sr(1) 2330 .m(6) 2331 .n(16) 2332 
.k(2) 2333 .qmin(128) 2334 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 2335 } 2336 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,qmax)2337 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, qmax) { 2338 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2339 GemmMicrokernelTester() 2340 .mr(6) 2341 .nr(16) 2342 .kr(1) 2343 .sr(1) 2344 .m(6) 2345 .n(16) 2346 .k(2) 2347 .qmax(128) 2348 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 2349 } 2350 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55,strided_cm)2351 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55, strided_cm) { 2352 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2353 GemmMicrokernelTester() 2354 .mr(6) 2355 .nr(16) 2356 .kr(1) 2357 .sr(1) 2358 .m(6) 2359 .n(16) 2360 .k(2) 2361 .cm_stride(19) 2362 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55, xnn_init_f16_minmax_neon_params); 2363 } 2364 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 2365 2366 2367 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_eq_4)2368 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4) { 2369 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2370 GemmMicrokernelTester() 2371 .mr(6) 2372 .nr(16) 2373 .kr(1) 2374 .sr(1) 2375 .m(6) 2376 .n(16) 2377 .k(4) 2378 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2379 } 2380 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,strided_cn)2381 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, strided_cn) { 2382 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2383 GemmMicrokernelTester() 2384 .mr(6) 2385 .nr(16) 2386 .kr(1) 2387 .sr(1) 2388 .m(6) 2389 .n(16) 2390 .k(4) 2391 .cn_stride(19) 2392 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2393 } 2394 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_eq_4_subtile)2395 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_subtile) { 2396 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2397 for (uint32_t n = 1; n <= 16; n++) { 2398 for (uint32_t m = 1; m <= 6; m++) { 2399 GemmMicrokernelTester() 2400 .mr(6) 2401 .nr(16) 2402 .kr(1) 2403 .sr(1) 2404 .m(m) 2405 .n(n) 2406 .k(4) 2407 .iterations(1) 2408 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2409 } 2410 } 2411 } 2412 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_eq_4_subtile_m)2413 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_subtile_m) { 2414 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2415 for (uint32_t m = 1; m <= 6; m++) { 2416 GemmMicrokernelTester() 2417 .mr(6) 2418 .nr(16) 2419 .kr(1) 2420 .sr(1) 2421 .m(m) 2422 .n(16) 2423 .k(4) 2424 .iterations(1) 2425 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2426 } 2427 } 2428 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_eq_4_subtile_n)2429 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_eq_4_subtile_n) { 2430 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2431 for (uint32_t n = 1; n <= 16; n++) { 2432 GemmMicrokernelTester() 2433 .mr(6) 2434 .nr(16) 2435 .kr(1) 2436 .sr(1) 2437 .m(6) 2438 .n(n) 2439 .k(4) 2440 .iterations(1) 2441 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2442 } 2443 } 2444 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_lt_4)2445 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_lt_4) { 2446 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2447 for (size_t k = 1; k < 4; k++) { 2448 GemmMicrokernelTester() 2449 .mr(6) 2450 .nr(16) 
2451 .kr(1) 2452 .sr(1) 2453 .m(6) 2454 .n(16) 2455 .k(k) 2456 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2457 } 2458 } 2459 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_lt_4_subtile)2460 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_lt_4_subtile) { 2461 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2462 for (size_t k = 1; k < 4; k++) { 2463 for (uint32_t n = 1; n <= 16; n++) { 2464 for (uint32_t m = 1; m <= 6; m++) { 2465 GemmMicrokernelTester() 2466 .mr(6) 2467 .nr(16) 2468 .kr(1) 2469 .sr(1) 2470 .m(m) 2471 .n(n) 2472 .k(k) 2473 .iterations(1) 2474 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2475 } 2476 } 2477 } 2478 } 2479 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_gt_4)2480 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_gt_4) { 2481 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2482 for (size_t k = 5; k < 8; k++) { 2483 GemmMicrokernelTester() 2484 .mr(6) 2485 .nr(16) 2486 .kr(1) 2487 .sr(1) 2488 .m(6) 2489 .n(16) 2490 .k(k) 2491 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2492 } 2493 } 2494 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_gt_4_subtile)2495 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_gt_4_subtile) { 2496 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2497 for (size_t k = 5; k < 8; k++) { 2498 for (uint32_t n = 1; n <= 16; n++) { 2499 for (uint32_t m = 1; m <= 6; m++) { 2500 GemmMicrokernelTester() 2501 .mr(6) 2502 .nr(16) 2503 .kr(1) 2504 .sr(1) 2505 .m(m) 2506 .n(n) 2507 .k(k) 2508 .iterations(1) 2509 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2510 } 2511 } 2512 } 2513 } 2514 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,k_div_4)2515 
// Sweeps k over 8, 12, ..., 40 with m = 6 and n = 16.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(16).k(k_dim)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// Sweeps k over 8, 12, ..., 40, n over 1..16 and m over 1..6, one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_dim = 8; k_dim <= 40; k_dim += 4) {
    for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Sweeps n over 17..31 and k over {1, 6, 11, 16} with m = 6.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 17; n_dim < 32; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(6).n(n_dim).k(k_dim)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_16, with cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 17; n_dim < 32; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(6).n(n_dim).k(k_dim)
          .cn_stride(19)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Sweeps n over 17..31, k over {1, 6, 11, 16} and m over 1..6, one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 17; n_dim < 32; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Sweeps n over {32, 48} and k over {1, 6, 11, 16} with m = 6.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 32; n_dim <= 48; n_dim += 16) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(6).n(n_dim).k(k_dim)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_16, with cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 32; n_dim <= 48; n_dim += 16) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(6).n(n_dim).k(k_dim)
          .cn_stride(19)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Sweeps n over {32, 48}, k over {1, 6, 11, 16} and m over 1..6, one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 32; n_dim <= 48; n_dim += 16) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Sweeps k over {1, 6, 11, 16} with m = 6, n = 16 and ks = 3.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(16).k(k_dim)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

// Sweeps k over {1, 6, 11, 16}, n over 1..16 and m over 1..6 with ks = 3, one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, small_kernel_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .ks(3)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Sweeps n over 17..31 and k over {1, 6, 11, 16} with m = 6 and ks = 3.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_gt_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 17; n_dim < 32; ++n_dim) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(6).n(n_dim).k(k_dim)
          .ks(3)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Sweeps n over {32, 48} and k over {1, 6, 11, 16} with m = 6 and ks = 3.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, n_div_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n_dim = 32; n_dim <= 48; n_dim += 16) {
    for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
      GemmMicrokernelTester()
          .mr(6).nr(16).kr(1).sr(1)
          .m(6).n(n_dim).k(k_dim)
          .ks(3)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Sweeps k over {1, 6, 11, 16}, n over 1..16 and m over 1..6 with cm_stride = 19, one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    for (uint32_t n_dim = 1; n_dim <= 16; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 6; ++m_dim) {
        GemmMicrokernelTester()
            .mr(6).nr(16).kr(1).sr(1)
            .m(m_dim).n(n_dim).k(k_dim)
            .cm_stride(19)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Sweeps k over {1, 6, 11, 16} with ks = 3 and a_offset = 127.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, a_offset) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k_dim = 1; k_dim <= 20; k_dim += 5) {
    GemmMicrokernelTester()
        .mr(6).nr(16).kr(1).sr(1)
        .m(6).n(16).k(k_dim)
        .ks(3)
        .a_offset(127)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,zero)2771 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, zero) { 2772 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2773 for (size_t k = 1; k <= 20; k += 5) { 2774 for (uint32_t mz = 0; mz < 6; mz++) { 2775 GemmMicrokernelTester() 2776 .mr(6) 2777 .nr(16) 2778 .kr(1) 2779 .sr(1) 2780 .m(6) 2781 .n(16) 2782 .k(k) 2783 .ks(3) 2784 .a_offset(127) 2785 .zero_index(mz) 2786 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2787 } 2788 } 2789 } 2790 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,qmin)2791 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, qmin) { 2792 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2793 GemmMicrokernelTester() 2794 .mr(6) 2795 .nr(16) 2796 .kr(1) 2797 .sr(1) 2798 .m(6) 2799 .n(16) 2800 .k(4) 2801 .qmin(128) 2802 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2803 } 2804 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,qmax)2805 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, qmax) { 2806 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2807 GemmMicrokernelTester() 2808 .mr(6) 2809 .nr(16) 2810 .kr(1) 2811 .sr(1) 2812 .m(6) 2813 .n(16) 2814 .k(4) 2815 .qmax(128) 2816 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2817 } 2818 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0,strided_cm)2819 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A55R0, strided_cm) { 2820 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2821 GemmMicrokernelTester() 2822 .mr(6) 2823 .nr(16) 2824 .kr(1) 2825 .sr(1) 2826 .m(6) 2827 .n(16) 2828 .k(4) 2829 .cm_stride(19) 2830 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0, xnn_init_f16_minmax_neon_params); 2831 } 2832 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 2833 2834 
2835 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4)2836 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4) { 2837 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2838 GemmMicrokernelTester() 2839 .mr(6) 2840 .nr(16) 2841 .kr(1) 2842 .sr(1) 2843 .m(6) 2844 .n(16) 2845 .k(4) 2846 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2847 } 2848 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,strided_cn)2849 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, strided_cn) { 2850 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2851 GemmMicrokernelTester() 2852 .mr(6) 2853 .nr(16) 2854 .kr(1) 2855 .sr(1) 2856 .m(6) 2857 .n(16) 2858 .k(4) 2859 .cn_stride(19) 2860 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2861 } 2862 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_subtile)2863 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_subtile) { 2864 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2865 for (uint32_t n = 1; n <= 16; n++) { 2866 for (uint32_t m = 1; m <= 6; m++) { 2867 GemmMicrokernelTester() 2868 .mr(6) 2869 .nr(16) 2870 .kr(1) 2871 .sr(1) 2872 .m(m) 2873 .n(n) 2874 .k(4) 2875 .iterations(1) 2876 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2877 } 2878 } 2879 } 2880 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_subtile_m)2881 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_subtile_m) { 2882 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2883 for (uint32_t m = 1; m <= 6; m++) { 2884 GemmMicrokernelTester() 2885 .mr(6) 2886 .nr(16) 2887 .kr(1) 2888 .sr(1) 2889 .m(m) 2890 .n(16) 2891 .k(4) 2892 .iterations(1) 2893 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2894 } 
2895 } 2896 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_eq_4_subtile_n)2897 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_eq_4_subtile_n) { 2898 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2899 for (uint32_t n = 1; n <= 16; n++) { 2900 GemmMicrokernelTester() 2901 .mr(6) 2902 .nr(16) 2903 .kr(1) 2904 .sr(1) 2905 .m(6) 2906 .n(n) 2907 .k(4) 2908 .iterations(1) 2909 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2910 } 2911 } 2912 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_lt_4)2913 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_lt_4) { 2914 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2915 for (size_t k = 1; k < 4; k++) { 2916 GemmMicrokernelTester() 2917 .mr(6) 2918 .nr(16) 2919 .kr(1) 2920 .sr(1) 2921 .m(6) 2922 .n(16) 2923 .k(k) 2924 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2925 } 2926 } 2927 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_lt_4_subtile)2928 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_lt_4_subtile) { 2929 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2930 for (size_t k = 1; k < 4; k++) { 2931 for (uint32_t n = 1; n <= 16; n++) { 2932 for (uint32_t m = 1; m <= 6; m++) { 2933 GemmMicrokernelTester() 2934 .mr(6) 2935 .nr(16) 2936 .kr(1) 2937 .sr(1) 2938 .m(m) 2939 .n(n) 2940 .k(k) 2941 .iterations(1) 2942 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2943 } 2944 } 2945 } 2946 } 2947 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_gt_4)2948 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_gt_4) { 2949 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2950 for (size_t k = 5; k < 8; k++) { 2951 GemmMicrokernelTester() 2952 .mr(6) 2953 .nr(16) 2954 .kr(1) 2955 .sr(1) 2956 .m(6) 2957 .n(16) 2958 .k(k) 2959 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2960 } 2961 } 2962 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_gt_4_subtile)2963 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_gt_4_subtile) { 2964 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2965 for (size_t k = 5; k < 8; k++) { 2966 for (uint32_t n = 1; n <= 16; n++) { 2967 for (uint32_t m = 1; m <= 6; m++) { 2968 GemmMicrokernelTester() 2969 .mr(6) 2970 .nr(16) 2971 .kr(1) 2972 .sr(1) 2973 .m(m) 2974 .n(n) 2975 .k(k) 2976 .iterations(1) 2977 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2978 } 2979 } 2980 } 2981 } 2982 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_div_4)2983 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_div_4) { 2984 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 2985 for (size_t k = 8; k <= 40; k += 4) { 2986 GemmMicrokernelTester() 2987 .mr(6) 2988 .nr(16) 2989 .kr(1) 2990 .sr(1) 2991 .m(6) 2992 .n(16) 2993 .k(k) 2994 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 2995 } 2996 } 2997 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,k_div_4_subtile)2998 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, k_div_4_subtile) { 2999 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3000 for (size_t k = 8; k <= 40; k += 4) { 3001 for (uint32_t n = 1; n <= 16; n++) { 3002 for (uint32_t m = 1; m <= 6; m++) { 3003 GemmMicrokernelTester() 3004 .mr(6) 3005 .nr(16) 3006 .kr(1) 3007 .sr(1) 3008 .m(m) 3009 .n(n) 3010 .k(k) 3011 .iterations(1) 3012 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3013 } 3014 } 3015 } 3016 } 3017 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16)3018 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16) { 3019 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 
3020 for (uint32_t n = 17; n < 32; n++) { 3021 for (size_t k = 1; k <= 20; k += 5) { 3022 GemmMicrokernelTester() 3023 .mr(6) 3024 .nr(16) 3025 .kr(1) 3026 .sr(1) 3027 .m(6) 3028 .n(n) 3029 .k(k) 3030 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3031 } 3032 } 3033 } 3034 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16_strided_cn)3035 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16_strided_cn) { 3036 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3037 for (uint32_t n = 17; n < 32; n++) { 3038 for (size_t k = 1; k <= 20; k += 5) { 3039 GemmMicrokernelTester() 3040 .mr(6) 3041 .nr(16) 3042 .kr(1) 3043 .sr(1) 3044 .m(6) 3045 .n(n) 3046 .k(k) 3047 .cn_stride(19) 3048 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3049 } 3050 } 3051 } 3052 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16_subtile)3053 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16_subtile) { 3054 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3055 for (uint32_t n = 17; n < 32; n++) { 3056 for (size_t k = 1; k <= 20; k += 5) { 3057 for (uint32_t m = 1; m <= 6; m++) { 3058 GemmMicrokernelTester() 3059 .mr(6) 3060 .nr(16) 3061 .kr(1) 3062 .sr(1) 3063 .m(m) 3064 .n(n) 3065 .k(k) 3066 .iterations(1) 3067 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3068 } 3069 } 3070 } 3071 } 3072 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16)3073 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16) { 3074 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3075 for (uint32_t n = 32; n <= 48; n += 16) { 3076 for (size_t k = 1; k <= 20; k += 5) { 3077 GemmMicrokernelTester() 3078 .mr(6) 3079 .nr(16) 3080 .kr(1) 3081 .sr(1) 3082 .m(6) 3083 .n(n) 3084 .k(k) 3085 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, 
xnn_init_f16_minmax_neon_params); 3086 } 3087 } 3088 } 3089 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16_strided_cn)3090 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16_strided_cn) { 3091 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3092 for (uint32_t n = 32; n <= 48; n += 16) { 3093 for (size_t k = 1; k <= 20; k += 5) { 3094 GemmMicrokernelTester() 3095 .mr(6) 3096 .nr(16) 3097 .kr(1) 3098 .sr(1) 3099 .m(6) 3100 .n(n) 3101 .k(k) 3102 .cn_stride(19) 3103 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3104 } 3105 } 3106 } 3107 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16_subtile)3108 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16_subtile) { 3109 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3110 for (uint32_t n = 32; n <= 48; n += 16) { 3111 for (size_t k = 1; k <= 20; k += 5) { 3112 for (uint32_t m = 1; m <= 6; m++) { 3113 GemmMicrokernelTester() 3114 .mr(6) 3115 .nr(16) 3116 .kr(1) 3117 .sr(1) 3118 .m(m) 3119 .n(n) 3120 .k(k) 3121 .iterations(1) 3122 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3123 } 3124 } 3125 } 3126 } 3127 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,small_kernel)3128 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, small_kernel) { 3129 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3130 for (size_t k = 1; k <= 20; k += 5) { 3131 GemmMicrokernelTester() 3132 .mr(6) 3133 .nr(16) 3134 .kr(1) 3135 .sr(1) 3136 .m(6) 3137 .n(16) 3138 .k(k) 3139 .ks(3) 3140 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3141 } 3142 } 3143 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,small_kernel_subtile)3144 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, small_kernel_subtile) { 3145 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3146 for (size_t k = 1; k <= 20; k += 5) { 
3147 for (uint32_t n = 1; n <= 16; n++) { 3148 for (uint32_t m = 1; m <= 6; m++) { 3149 GemmMicrokernelTester() 3150 .mr(6) 3151 .nr(16) 3152 .kr(1) 3153 .sr(1) 3154 .m(m) 3155 .n(n) 3156 .k(k) 3157 .ks(3) 3158 .iterations(1) 3159 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3160 } 3161 } 3162 } 3163 } 3164 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_gt_16_small_kernel)3165 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_gt_16_small_kernel) { 3166 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3167 for (uint32_t n = 17; n < 32; n++) { 3168 for (size_t k = 1; k <= 20; k += 5) { 3169 GemmMicrokernelTester() 3170 .mr(6) 3171 .nr(16) 3172 .kr(1) 3173 .sr(1) 3174 .m(6) 3175 .n(n) 3176 .k(k) 3177 .ks(3) 3178 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3179 } 3180 } 3181 } 3182 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,n_div_16_small_kernel)3183 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, n_div_16_small_kernel) { 3184 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3185 for (uint32_t n = 32; n <= 48; n += 16) { 3186 for (size_t k = 1; k <= 20; k += 5) { 3187 GemmMicrokernelTester() 3188 .mr(6) 3189 .nr(16) 3190 .kr(1) 3191 .sr(1) 3192 .m(6) 3193 .n(n) 3194 .k(k) 3195 .ks(3) 3196 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3197 } 3198 } 3199 } 3200 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,strided_cm_subtile)3201 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, strided_cm_subtile) { 3202 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3203 for (size_t k = 1; k <= 20; k += 5) { 3204 for (uint32_t n = 1; n <= 16; n++) { 3205 for (uint32_t m = 1; m <= 6; m++) { 3206 GemmMicrokernelTester() 3207 .mr(6) 3208 .nr(16) 3209 .kr(1) 3210 .sr(1) 3211 .m(m) 3212 .n(n) 3213 .k(k) 3214 .cm_stride(19) 3215 .iterations(1) 3216 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3217 } 3218 } 3219 } 3220 } 3221 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,a_offset)3222 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, a_offset) { 3223 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3224 for (size_t k = 1; k <= 20; k += 5) { 3225 GemmMicrokernelTester() 3226 .mr(6) 3227 .nr(16) 3228 .kr(1) 3229 .sr(1) 3230 .m(6) 3231 .n(16) 3232 .k(k) 3233 .ks(3) 3234 .a_offset(127) 3235 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3236 } 3237 } 3238 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,zero)3239 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, zero) { 3240 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3241 for (size_t k = 1; k <= 20; k += 5) { 3242 for (uint32_t mz = 0; mz < 6; mz++) { 3243 GemmMicrokernelTester() 3244 .mr(6) 3245 .nr(16) 3246 .kr(1) 3247 .sr(1) 3248 .m(6) 3249 .n(16) 3250 .k(k) 3251 .ks(3) 3252 .a_offset(127) 3253 .zero_index(mz) 3254 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3255 } 3256 } 3257 } 3258 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,qmin)3259 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, qmin) { 3260 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3261 GemmMicrokernelTester() 3262 .mr(6) 3263 .nr(16) 3264 .kr(1) 3265 .sr(1) 3266 .m(6) 3267 .n(16) 3268 .k(4) 3269 .qmin(128) 3270 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3271 } 3272 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,qmax)3273 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, qmax) { 3274 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3275 GemmMicrokernelTester() 3276 .mr(6) 3277 .nr(16) 3278 .kr(1) 3279 .sr(1) 3280 .m(6) 3281 .n(16) 3282 .k(4) 3283 .qmax(128) 3284 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3285 } 3286 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75,strided_cm)3287 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_CORTEX_A75, strided_cm) { 3288 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3289 GemmMicrokernelTester() 3290 .mr(6) 3291 .nr(16) 3292 .kr(1) 3293 .sr(1) 3294 .m(6) 3295 .n(16) 3296 .k(4) 3297 .cm_stride(19) 3298 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75, xnn_init_f16_minmax_neon_params); 3299 } 3300 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 3301 3302 3303 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2)3304 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) { 3305 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3306 GemmMicrokernelTester() 3307 .mr(6) 3308 .nr(16) 3309 .kr(1) 3310 .sr(1) 3311 .m(6) 3312 .n(16) 3313 .k(2) 3314 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3315 } 3316 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cn)3317 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) { 3318 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3319 GemmMicrokernelTester() 3320 .mr(6) 3321 .nr(16) 3322 .kr(1) 3323 .sr(1) 3324 .m(6) 3325 .n(16) 3326 .k(2) 3327 .cn_stride(19) 3328 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3329 } 3330 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile)3331 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) { 3332 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3333 for (uint32_t n = 1; n <= 16; n++) { 3334 for (uint32_t m = 1; m <= 6; m++) { 3335 GemmMicrokernelTester() 3336 .mr(6) 3337 .nr(16) 3338 .kr(1) 3339 .sr(1) 3340 .m(m) 3341 .n(n) 3342 .k(2) 3343 .iterations(1) 3344 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3345 } 3346 } 3347 } 3348 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_m)3349 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) { 3350 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3351 for (uint32_t m = 1; m <= 6; m++) { 3352 GemmMicrokernelTester() 3353 .mr(6) 3354 .nr(16) 3355 .kr(1) 3356 .sr(1) 3357 .m(m) 3358 .n(16) 3359 .k(2) 3360 .iterations(1) 3361 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3362 } 3363 } 3364 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_eq_2_subtile_n)3365 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) { 3366 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3367 for (uint32_t n = 1; n <= 16; n++) { 3368 GemmMicrokernelTester() 3369 .mr(6) 3370 .nr(16) 3371 .kr(1) 3372 .sr(1) 3373 .m(6) 3374 .n(n) 3375 .k(2) 3376 .iterations(1) 3377 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3378 } 3379 } 3380 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2)3381 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) { 3382 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3383 for (size_t k = 1; k < 2; k++) { 3384 GemmMicrokernelTester() 3385 .mr(6) 3386 .nr(16) 3387 .kr(1) 3388 .sr(1) 3389 .m(6) 3390 .n(16) 3391 .k(k) 3392 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3393 } 3394 } 3395 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_lt_2_subtile)3396 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) { 3397 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3398 for (size_t k = 1; k < 2; k++) { 3399 for (uint32_t n = 1; n <= 16; n++) { 3400 for (uint32_t m = 1; m <= 6; m++) { 3401 GemmMicrokernelTester() 3402 .mr(6) 3403 .nr(16) 3404 .kr(1) 3405 .sr(1) 3406 .m(m) 3407 .n(n) 3408 .k(k) 3409 
.iterations(1) 3410 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3411 } 3412 } 3413 } 3414 } 3415 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2)3416 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) { 3417 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3418 for (size_t k = 3; k < 4; k++) { 3419 GemmMicrokernelTester() 3420 .mr(6) 3421 .nr(16) 3422 .kr(1) 3423 .sr(1) 3424 .m(6) 3425 .n(16) 3426 .k(k) 3427 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3428 } 3429 } 3430 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_gt_2_subtile)3431 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) { 3432 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3433 for (size_t k = 3; k < 4; k++) { 3434 for (uint32_t n = 1; n <= 16; n++) { 3435 for (uint32_t m = 1; m <= 6; m++) { 3436 GemmMicrokernelTester() 3437 .mr(6) 3438 .nr(16) 3439 .kr(1) 3440 .sr(1) 3441 .m(m) 3442 .n(n) 3443 .k(k) 3444 .iterations(1) 3445 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3446 } 3447 } 3448 } 3449 } 3450 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_div_2)3451 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) { 3452 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3453 for (size_t k = 4; k <= 20; k += 2) { 3454 GemmMicrokernelTester() 3455 .mr(6) 3456 .nr(16) 3457 .kr(1) 3458 .sr(1) 3459 .m(6) 3460 .n(16) 3461 .k(k) 3462 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3463 } 3464 } 3465 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,k_div_2_subtile)3466 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) { 3467 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3468 for (size_t k = 4; k <= 20; k += 2) { 3469 for (uint32_t n = 1; n <= 16; n++) { 3470 for (uint32_t m = 1; m <= 6; m++) { 3471 GemmMicrokernelTester() 3472 
.mr(6) 3473 .nr(16) 3474 .kr(1) 3475 .sr(1) 3476 .m(m) 3477 .n(n) 3478 .k(k) 3479 .iterations(1) 3480 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3481 } 3482 } 3483 } 3484 } 3485 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16)3486 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) { 3487 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3488 for (uint32_t n = 17; n < 32; n++) { 3489 for (size_t k = 1; k <= 10; k += 3) { 3490 GemmMicrokernelTester() 3491 .mr(6) 3492 .nr(16) 3493 .kr(1) 3494 .sr(1) 3495 .m(6) 3496 .n(n) 3497 .k(k) 3498 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3499 } 3500 } 3501 } 3502 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_strided_cn)3503 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) { 3504 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3505 for (uint32_t n = 17; n < 32; n++) { 3506 for (size_t k = 1; k <= 10; k += 3) { 3507 GemmMicrokernelTester() 3508 .mr(6) 3509 .nr(16) 3510 .kr(1) 3511 .sr(1) 3512 .m(6) 3513 .n(n) 3514 .k(k) 3515 .cn_stride(19) 3516 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3517 } 3518 } 3519 } 3520 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_gt_16_subtile)3521 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) { 3522 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3523 for (uint32_t n = 17; n < 32; n++) { 3524 for (size_t k = 1; k <= 10; k += 3) { 3525 for (uint32_t m = 1; m <= 6; m++) { 3526 GemmMicrokernelTester() 3527 .mr(6) 3528 .nr(16) 3529 .kr(1) 3530 .sr(1) 3531 .m(m) 3532 .n(n) 3533 .k(k) 3534 .iterations(1) 3535 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3536 } 3537 } 3538 } 3539 } 3540 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,n_div_16)3541 
// n = 32 and 48, k = 1, 4, 7, 10, with m = 6.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_16, with cn_stride(19).
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 32 and 48, k = 1, 4, 7, 10, every m in 1..6; one iteration per configuration.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 1, 4, 7, 10 with ks(3), m = 6, n = 16.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .ks(3)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1, 4, 7, 10 with ks(3), every n in 1..16 and m in 1..6; one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, small_kernel_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .ks(3)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n in 17..31, k = 1, 4, 7, 10 with ks(3), m = 6.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 32 and 48, k = 1, 4, 7, 10 with ks(3), m = 6.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 10; k += 3) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(6)
        .n(n)
        .k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// k = 1, 4, 7, 10 with cm_stride(19), every n in 1..16 and m in 1..6; one iteration each.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 6; m++) {
        GemmMicrokernelTester()
          .mr(6)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 1, 4, 7, 10 with ks(3) and a_offset(67), m = 6, n = 16.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, a_offset) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    GemmMicrokernelTester()
      .mr(6)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(6)
      .n(16)
      .k(k)
      .ks(3)
      .a_offset(67)
      .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
  }
}

// Same configuration as a_offset, additionally sweeping zero_index over 0..5.
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, zero) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 10; k += 3) {
    for (uint32_t mz = 0; mz < 6; mz++) {
      GemmMicrokernelTester()
        .mr(6)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(6)
        .n(16)
        .k(k)
        .ks(3)
        .a_offset(67)
        .zero_index(mz)
        .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Single configuration (m = 6, n = 16, k = 2) with qmin(128).
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmin) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  GemmMicrokernelTester()
    .mr(6)
    .nr(16)
    .kr(1)
    .sr(1)
    .m(6)
    .n(16)
    .k(2)
    .qmin(128)
    .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params);
}

TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmax) { 3742 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3743 GemmMicrokernelTester() 3744 .mr(6) 3745 .nr(16) 3746 .kr(1) 3747 .sr(1) 3748 .m(6) 3749 .n(16) 3750 .k(2) 3751 .qmax(128) 3752 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3753 } 3754 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32,strided_cm)3755 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) { 3756 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3757 GemmMicrokernelTester() 3758 .mr(6) 3759 .nr(16) 3760 .kr(1) 3761 .sr(1) 3762 .m(6) 3763 .n(16) 3764 .k(2) 3765 .cm_stride(19) 3766 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, xnn_init_f16_minmax_neon_params); 3767 } 3768 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 3769 3770 3771 #if XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4)3772 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4) { 3773 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3774 GemmMicrokernelTester() 3775 .mr(6) 3776 .nr(16) 3777 .kr(1) 3778 .sr(1) 3779 .m(6) 3780 .n(16) 3781 .k(4) 3782 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3783 } 3784 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,strided_cn)3785 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, strided_cn) { 3786 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3787 GemmMicrokernelTester() 3788 .mr(6) 3789 .nr(16) 3790 .kr(1) 3791 .sr(1) 3792 .m(6) 3793 .n(16) 3794 .k(4) 3795 .cn_stride(19) 3796 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3797 } 3798 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile)3799 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) { 3800 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3801 for (uint32_t n = 1; n <= 16; 
n++) { 3802 for (uint32_t m = 1; m <= 6; m++) { 3803 GemmMicrokernelTester() 3804 .mr(6) 3805 .nr(16) 3806 .kr(1) 3807 .sr(1) 3808 .m(m) 3809 .n(n) 3810 .k(4) 3811 .iterations(1) 3812 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3813 } 3814 } 3815 } 3816 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_m)3817 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 3818 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3819 for (uint32_t m = 1; m <= 6; m++) { 3820 GemmMicrokernelTester() 3821 .mr(6) 3822 .nr(16) 3823 .kr(1) 3824 .sr(1) 3825 .m(m) 3826 .n(16) 3827 .k(4) 3828 .iterations(1) 3829 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3830 } 3831 } 3832 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_eq_4_subtile_n)3833 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 3834 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3835 for (uint32_t n = 1; n <= 16; n++) { 3836 GemmMicrokernelTester() 3837 .mr(6) 3838 .nr(16) 3839 .kr(1) 3840 .sr(1) 3841 .m(6) 3842 .n(n) 3843 .k(4) 3844 .iterations(1) 3845 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3846 } 3847 } 3848 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4)3849 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4) { 3850 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3851 for (size_t k = 1; k < 4; k++) { 3852 GemmMicrokernelTester() 3853 .mr(6) 3854 .nr(16) 3855 .kr(1) 3856 .sr(1) 3857 .m(6) 3858 .n(16) 3859 .k(k) 3860 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3861 } 3862 } 3863 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_lt_4_subtile)3864 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) { 3865 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3866 for (size_t k = 1; k < 4; k++) { 3867 for (uint32_t n = 
1; n <= 16; n++) { 3868 for (uint32_t m = 1; m <= 6; m++) { 3869 GemmMicrokernelTester() 3870 .mr(6) 3871 .nr(16) 3872 .kr(1) 3873 .sr(1) 3874 .m(m) 3875 .n(n) 3876 .k(k) 3877 .iterations(1) 3878 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3879 } 3880 } 3881 } 3882 } 3883 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4)3884 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4) { 3885 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3886 for (size_t k = 5; k < 8; k++) { 3887 GemmMicrokernelTester() 3888 .mr(6) 3889 .nr(16) 3890 .kr(1) 3891 .sr(1) 3892 .m(6) 3893 .n(16) 3894 .k(k) 3895 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3896 } 3897 } 3898 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_gt_4_subtile)3899 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) { 3900 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3901 for (size_t k = 5; k < 8; k++) { 3902 for (uint32_t n = 1; n <= 16; n++) { 3903 for (uint32_t m = 1; m <= 6; m++) { 3904 GemmMicrokernelTester() 3905 .mr(6) 3906 .nr(16) 3907 .kr(1) 3908 .sr(1) 3909 .m(m) 3910 .n(n) 3911 .k(k) 3912 .iterations(1) 3913 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3914 } 3915 } 3916 } 3917 } 3918 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_div_4)3919 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_div_4) { 3920 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3921 for (size_t k = 8; k <= 40; k += 4) { 3922 GemmMicrokernelTester() 3923 .mr(6) 3924 .nr(16) 3925 .kr(1) 3926 .sr(1) 3927 .m(6) 3928 .n(16) 3929 .k(k) 3930 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3931 } 3932 } 3933 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,k_div_4_subtile)3934 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) { 3935 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3936 for (size_t k = 8; k <= 40; k += 4) { 3937 for (uint32_t n = 1; n <= 16; n++) { 3938 for (uint32_t m = 1; m <= 6; m++) { 3939 GemmMicrokernelTester() 3940 .mr(6) 3941 .nr(16) 3942 .kr(1) 3943 .sr(1) 3944 .m(m) 3945 .n(n) 3946 .k(k) 3947 .iterations(1) 3948 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3949 } 3950 } 3951 } 3952 } 3953 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16)3954 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16) { 3955 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3956 for (uint32_t n = 17; n < 32; n++) { 3957 for (size_t k = 1; k <= 20; k += 5) { 3958 GemmMicrokernelTester() 3959 .mr(6) 3960 .nr(16) 3961 .kr(1) 3962 .sr(1) 3963 .m(6) 3964 .n(n) 3965 .k(k) 3966 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3967 } 3968 } 3969 } 3970 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_strided_cn)3971 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 3972 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3973 for (uint32_t n = 17; n < 32; n++) { 3974 for (size_t k = 1; k <= 20; k += 5) { 3975 GemmMicrokernelTester() 3976 .mr(6) 3977 .nr(16) 3978 .kr(1) 3979 .sr(1) 3980 .m(6) 3981 .n(n) 3982 .k(k) 3983 .cn_stride(19) 3984 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 3985 } 3986 } 3987 } 3988 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_subtile)3989 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_subtile) { 3990 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 3991 for (uint32_t n = 17; n < 32; n++) { 3992 for (size_t k = 1; k <= 20; k += 5) { 3993 for (uint32_t m = 1; m <= 6; m++) { 3994 GemmMicrokernelTester() 3995 .mr(6) 3996 .nr(16) 3997 .kr(1) 3998 .sr(1) 3999 .m(m) 4000 .n(n) 4001 .k(k) 4002 .iterations(1) 4003 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4004 } 4005 } 4006 } 4007 } 4008 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16)4009 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16) { 4010 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4011 for (uint32_t n = 32; n <= 48; n += 16) { 4012 for (size_t k = 1; k <= 20; k += 5) { 4013 GemmMicrokernelTester() 4014 .mr(6) 4015 .nr(16) 4016 .kr(1) 4017 .sr(1) 4018 .m(6) 4019 .n(n) 4020 .k(k) 4021 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4022 } 4023 } 4024 } 4025 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_strided_cn)4026 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_strided_cn) { 4027 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4028 for (uint32_t n = 32; n <= 48; n += 16) { 4029 for (size_t k = 1; k <= 20; k += 5) { 4030 GemmMicrokernelTester() 4031 .mr(6) 4032 .nr(16) 4033 .kr(1) 4034 .sr(1) 4035 .m(6) 4036 .n(n) 4037 .k(k) 4038 .cn_stride(19) 4039 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4040 } 4041 } 4042 } 4043 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_subtile)4044 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_subtile) { 4045 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4046 for (uint32_t n = 32; n <= 48; n += 16) { 4047 for (size_t k = 1; k <= 20; k += 5) { 4048 for (uint32_t m = 1; m <= 6; m++) { 4049 GemmMicrokernelTester() 4050 .mr(6) 4051 .nr(16) 4052 .kr(1) 4053 .sr(1) 4054 .m(m) 4055 .n(n) 4056 .k(k) 4057 .iterations(1) 4058 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4059 } 4060 } 4061 } 4062 } 4063 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,small_kernel)4064 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, small_kernel) { 4065 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4066 for (size_t 
k = 1; k <= 20; k += 5) { 4067 GemmMicrokernelTester() 4068 .mr(6) 4069 .nr(16) 4070 .kr(1) 4071 .sr(1) 4072 .m(6) 4073 .n(16) 4074 .k(k) 4075 .ks(3) 4076 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4077 } 4078 } 4079 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,small_kernel_subtile)4080 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, small_kernel_subtile) { 4081 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4082 for (size_t k = 1; k <= 20; k += 5) { 4083 for (uint32_t n = 1; n <= 16; n++) { 4084 for (uint32_t m = 1; m <= 6; m++) { 4085 GemmMicrokernelTester() 4086 .mr(6) 4087 .nr(16) 4088 .kr(1) 4089 .sr(1) 4090 .m(m) 4091 .n(n) 4092 .k(k) 4093 .ks(3) 4094 .iterations(1) 4095 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4096 } 4097 } 4098 } 4099 } 4100 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_gt_16_small_kernel)4101 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_gt_16_small_kernel) { 4102 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4103 for (uint32_t n = 17; n < 32; n++) { 4104 for (size_t k = 1; k <= 20; k += 5) { 4105 GemmMicrokernelTester() 4106 .mr(6) 4107 .nr(16) 4108 .kr(1) 4109 .sr(1) 4110 .m(6) 4111 .n(n) 4112 .k(k) 4113 .ks(3) 4114 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4115 } 4116 } 4117 } 4118 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,n_div_16_small_kernel)4119 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, n_div_16_small_kernel) { 4120 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4121 for (uint32_t n = 32; n <= 48; n += 16) { 4122 for (size_t k = 1; k <= 20; k += 5) { 4123 GemmMicrokernelTester() 4124 .mr(6) 4125 .nr(16) 4126 .kr(1) 4127 .sr(1) 4128 .m(6) 4129 .n(n) 4130 .k(k) 4131 .ks(3) 4132 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4133 } 4134 } 4135 } 4136 
TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,strided_cm_subtile)4137 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) { 4138 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4139 for (size_t k = 1; k <= 20; k += 5) { 4140 for (uint32_t n = 1; n <= 16; n++) { 4141 for (uint32_t m = 1; m <= 6; m++) { 4142 GemmMicrokernelTester() 4143 .mr(6) 4144 .nr(16) 4145 .kr(1) 4146 .sr(1) 4147 .m(m) 4148 .n(n) 4149 .k(k) 4150 .cm_stride(19) 4151 .iterations(1) 4152 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4153 } 4154 } 4155 } 4156 } 4157 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,a_offset)4158 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, a_offset) { 4159 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4160 for (size_t k = 1; k <= 20; k += 5) { 4161 GemmMicrokernelTester() 4162 .mr(6) 4163 .nr(16) 4164 .kr(1) 4165 .sr(1) 4166 .m(6) 4167 .n(16) 4168 .k(k) 4169 .ks(3) 4170 .a_offset(127) 4171 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4172 } 4173 } 4174 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,zero)4175 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, zero) { 4176 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4177 for (size_t k = 1; k <= 20; k += 5) { 4178 for (uint32_t mz = 0; mz < 6; mz++) { 4179 GemmMicrokernelTester() 4180 .mr(6) 4181 .nr(16) 4182 .kr(1) 4183 .sr(1) 4184 .m(6) 4185 .n(16) 4186 .k(k) 4187 .ks(3) 4188 .a_offset(127) 4189 .zero_index(mz) 4190 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4191 } 4192 } 4193 } 4194 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,qmin)4195 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, qmin) { 4196 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4197 GemmMicrokernelTester() 4198 .mr(6) 4199 .nr(16) 4200 .kr(1) 4201 .sr(1) 4202 .m(6) 4203 .n(16) 4204 .k(4) 4205 .qmin(128) 4206 
.Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4207 } 4208 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,qmax)4209 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, qmax) { 4210 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4211 GemmMicrokernelTester() 4212 .mr(6) 4213 .nr(16) 4214 .kr(1) 4215 .sr(1) 4216 .m(6) 4217 .n(16) 4218 .k(4) 4219 .qmax(128) 4220 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4221 } 4222 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64,strided_cm)4223 TEST(F16_IGEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD64, strided_cm) { 4224 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4225 GemmMicrokernelTester() 4226 .mr(6) 4227 .nr(16) 4228 .kr(1) 4229 .sr(1) 4230 .m(6) 4231 .n(16) 4232 .k(4) 4233 .cm_stride(19) 4234 .Test(xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4235 } 4236 #endif // XNN_ENABLE_ARM_FP16 && XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 4237 4238 4239 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4)4240 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4) { 4241 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4242 GemmMicrokernelTester() 4243 .mr(1) 4244 .nr(8) 4245 .kr(1) 4246 .sr(1) 4247 .m(1) 4248 .n(8) 4249 .k(4) 4250 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4251 } 4252 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cn)4253 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cn) { 4254 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4255 GemmMicrokernelTester() 4256 .mr(1) 4257 .nr(8) 4258 .kr(1) 4259 .sr(1) 4260 .m(1) 4261 .n(8) 4262 .k(4) 4263 .cn_stride(11) 4264 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4265 } 4266 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,k_eq_4_subtile)4267 
// k = 4, every n in 1..8 and m in 1..1; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 8; n++) {
    for (uint32_t m = 1; m <= 1; m++) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(m)
        .n(n)
        .k(4)
        .iterations(1)
        .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// k = 4, n = 8, m in 1..1 (a single value because mr is 1); one iteration.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t m = 1; m <= 1; m++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(m)
      .n(8)
      .k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 4, m = 1, every n in 1..8; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(n)
      .k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1, 2, 3 with m = 1, n = 8.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 1, 2, 3, every n in 1..8 and m in 1..1; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k < 4; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 5, 6, 7 with m = 1, n = 8.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5, 6, 7, every n in 1..8 and m in 1..1; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 8, 12, ..., 40 (step 4) with m = 1, n = 8.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 (step 4), every n in 1..8 and m in 1..1; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n in 9..15, k = 1, 6, 11, 16, with m = 1.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_gt_8, with cn_stride(11).
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n in 9..15, k = 1, 6, 11, 16, m in 1..1; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// n = 16 and 24, k = 1, 6, 11, 16, with m = 1.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// Same sweep as n_div_8, with cn_stride(11).
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

// n = 16 and 24, k = 1, 6, 11, 16, m in 1..1; one iteration each.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// k = 1, 6, 11, 16 with ks(3), m = 1, n = 8.
TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, small_kernel) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 1; k <= 20; k += 5) {
    GemmMicrokernelTester()
      .mr(1)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(1)
      .n(8)
      .k(k)
      .ks(3)
      .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, small_kernel_subtile) { 4549 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4550 for (size_t k = 1; k <= 20; k += 5) { 4551 for (uint32_t n = 1; n <= 8; n++) { 4552 for (uint32_t m = 1; m <= 1; m++) { 4553 GemmMicrokernelTester() 4554 .mr(1) 4555 .nr(8) 4556 .kr(1) 4557 .sr(1) 4558 .m(m) 4559 .n(n) 4560 .k(k) 4561 .ks(3) 4562 .iterations(1) 4563 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4564 } 4565 } 4566 } 4567 } 4568 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_gt_8_small_kernel)4569 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) { 4570 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4571 for (uint32_t n = 9; n < 16; n++) { 4572 for (size_t k = 1; k <= 20; k += 5) { 4573 GemmMicrokernelTester() 4574 .mr(1) 4575 .nr(8) 4576 .kr(1) 4577 .sr(1) 4578 .m(1) 4579 .n(n) 4580 .k(k) 4581 .ks(3) 4582 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4583 } 4584 } 4585 } 4586 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,n_div_8_small_kernel)4587 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) { 4588 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4589 for (uint32_t n = 16; n <= 24; n += 8) { 4590 for (size_t k = 1; k <= 20; k += 5) { 4591 GemmMicrokernelTester() 4592 .mr(1) 4593 .nr(8) 4594 .kr(1) 4595 .sr(1) 4596 .m(1) 4597 .n(n) 4598 .k(k) 4599 .ks(3) 4600 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4601 } 4602 } 4603 } 4604 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cm_subtile)4605 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 4606 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4607 for (size_t k = 1; k <= 20; k += 5) { 4608 for (uint32_t n = 1; n <= 8; n++) { 4609 for (uint32_t m = 1; m <= 1; m++) { 4610 GemmMicrokernelTester() 4611 .mr(1) 4612 .nr(8) 4613 .kr(1) 4614 .sr(1) 4615 .m(m) 4616 .n(n) 4617 .k(k) 4618 .cm_stride(11) 4619 .iterations(1) 4620 
.Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4621 } 4622 } 4623 } 4624 } 4625 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,a_offset)4626 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, a_offset) { 4627 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4628 for (size_t k = 1; k <= 20; k += 5) { 4629 GemmMicrokernelTester() 4630 .mr(1) 4631 .nr(8) 4632 .kr(1) 4633 .sr(1) 4634 .m(1) 4635 .n(8) 4636 .k(k) 4637 .ks(3) 4638 .a_offset(23) 4639 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4640 } 4641 } 4642 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,zero)4643 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, zero) { 4644 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4645 for (size_t k = 1; k <= 20; k += 5) { 4646 for (uint32_t mz = 0; mz < 1; mz++) { 4647 GemmMicrokernelTester() 4648 .mr(1) 4649 .nr(8) 4650 .kr(1) 4651 .sr(1) 4652 .m(1) 4653 .n(8) 4654 .k(k) 4655 .ks(3) 4656 .a_offset(23) 4657 .zero_index(mz) 4658 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4659 } 4660 } 4661 } 4662 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,qmin)4663 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmin) { 4664 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4665 GemmMicrokernelTester() 4666 .mr(1) 4667 .nr(8) 4668 .kr(1) 4669 .sr(1) 4670 .m(1) 4671 .n(8) 4672 .k(4) 4673 .qmin(128) 4674 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4675 } 4676 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,qmax)4677 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmax) { 4678 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4679 GemmMicrokernelTester() 4680 .mr(1) 4681 .nr(8) 4682 .kr(1) 4683 .sr(1) 4684 .m(1) 4685 .n(8) 4686 .k(4) 4687 .qmax(128) 4688 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4689 } 4690 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64,strided_cm)4691 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, 
strided_cm) { 4692 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4693 GemmMicrokernelTester() 4694 .mr(1) 4695 .nr(8) 4696 .kr(1) 4697 .sr(1) 4698 .m(1) 4699 .n(8) 4700 .k(4) 4701 .cm_stride(11) 4702 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4703 } 4704 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 4705 4706 4707 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4)4708 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4) { 4709 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4710 GemmMicrokernelTester() 4711 .mr(1) 4712 .nr(16) 4713 .kr(1) 4714 .sr(1) 4715 .m(1) 4716 .n(16) 4717 .k(4) 4718 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4719 } 4720 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cn)4721 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cn) { 4722 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4723 GemmMicrokernelTester() 4724 .mr(1) 4725 .nr(16) 4726 .kr(1) 4727 .sr(1) 4728 .m(1) 4729 .n(16) 4730 .k(4) 4731 .cn_stride(19) 4732 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4733 } 4734 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_subtile)4735 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 4736 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4737 for (uint32_t n = 1; n <= 16; n++) { 4738 for (uint32_t m = 1; m <= 1; m++) { 4739 GemmMicrokernelTester() 4740 .mr(1) 4741 .nr(16) 4742 .kr(1) 4743 .sr(1) 4744 .m(m) 4745 .n(n) 4746 .k(4) 4747 .iterations(1) 4748 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4749 } 4750 } 4751 } 4752 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)4753 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 4754 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4755 for (uint32_t m = 1; m <= 1; m++) { 4756 GemmMicrokernelTester() 4757 .mr(1) 
4758 .nr(16) 4759 .kr(1) 4760 .sr(1) 4761 .m(m) 4762 .n(16) 4763 .k(4) 4764 .iterations(1) 4765 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4766 } 4767 } 4768 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)4769 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 4770 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4771 for (uint32_t n = 1; n <= 16; n++) { 4772 GemmMicrokernelTester() 4773 .mr(1) 4774 .nr(16) 4775 .kr(1) 4776 .sr(1) 4777 .m(1) 4778 .n(n) 4779 .k(4) 4780 .iterations(1) 4781 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4782 } 4783 } 4784 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_lt_4)4785 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4) { 4786 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4787 for (size_t k = 1; k < 4; k++) { 4788 GemmMicrokernelTester() 4789 .mr(1) 4790 .nr(16) 4791 .kr(1) 4792 .sr(1) 4793 .m(1) 4794 .n(16) 4795 .k(k) 4796 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4797 } 4798 } 4799 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_lt_4_subtile)4800 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 4801 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4802 for (size_t k = 1; k < 4; k++) { 4803 for (uint32_t n = 1; n <= 16; n++) { 4804 for (uint32_t m = 1; m <= 1; m++) { 4805 GemmMicrokernelTester() 4806 .mr(1) 4807 .nr(16) 4808 .kr(1) 4809 .sr(1) 4810 .m(m) 4811 .n(n) 4812 .k(k) 4813 .iterations(1) 4814 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4815 } 4816 } 4817 } 4818 } 4819 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_gt_4)4820 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4) { 4821 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4822 for (size_t k = 5; k < 8; k++) { 4823 GemmMicrokernelTester() 4824 .mr(1) 4825 .nr(16) 4826 .kr(1) 4827 .sr(1) 4828 .m(1) 4829 .n(16) 4830 .k(k) 4831 
.Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4832 } 4833 } 4834 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_gt_4_subtile)4835 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 4836 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4837 for (size_t k = 5; k < 8; k++) { 4838 for (uint32_t n = 1; n <= 16; n++) { 4839 for (uint32_t m = 1; m <= 1; m++) { 4840 GemmMicrokernelTester() 4841 .mr(1) 4842 .nr(16) 4843 .kr(1) 4844 .sr(1) 4845 .m(m) 4846 .n(n) 4847 .k(k) 4848 .iterations(1) 4849 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4850 } 4851 } 4852 } 4853 } 4854 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_div_4)4855 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4) { 4856 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4857 for (size_t k = 8; k <= 40; k += 4) { 4858 GemmMicrokernelTester() 4859 .mr(1) 4860 .nr(16) 4861 .kr(1) 4862 .sr(1) 4863 .m(1) 4864 .n(16) 4865 .k(k) 4866 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4867 } 4868 } 4869 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,k_div_4_subtile)4870 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 4871 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4872 for (size_t k = 8; k <= 40; k += 4) { 4873 for (uint32_t n = 1; n <= 16; n++) { 4874 for (uint32_t m = 1; m <= 1; m++) { 4875 GemmMicrokernelTester() 4876 .mr(1) 4877 .nr(16) 4878 .kr(1) 4879 .sr(1) 4880 .m(m) 4881 .n(n) 4882 .k(k) 4883 .iterations(1) 4884 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4885 } 4886 } 4887 } 4888 } 4889 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16)4890 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16) { 4891 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4892 for (uint32_t n = 17; n < 32; n++) { 4893 for (size_t k = 1; k <= 20; k += 5) { 4894 GemmMicrokernelTester() 4895 .mr(1) 4896 .nr(16) 4897 .kr(1) 4898 .sr(1) 4899 
.m(1) 4900 .n(n) 4901 .k(k) 4902 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4903 } 4904 } 4905 } 4906 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)4907 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 4908 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4909 for (uint32_t n = 17; n < 32; n++) { 4910 for (size_t k = 1; k <= 20; k += 5) { 4911 GemmMicrokernelTester() 4912 .mr(1) 4913 .nr(16) 4914 .kr(1) 4915 .sr(1) 4916 .m(1) 4917 .n(n) 4918 .k(k) 4919 .cn_stride(19) 4920 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4921 } 4922 } 4923 } 4924 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_subtile)4925 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 4926 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4927 for (uint32_t n = 17; n < 32; n++) { 4928 for (size_t k = 1; k <= 20; k += 5) { 4929 for (uint32_t m = 1; m <= 1; m++) { 4930 GemmMicrokernelTester() 4931 .mr(1) 4932 .nr(16) 4933 .kr(1) 4934 .sr(1) 4935 .m(m) 4936 .n(n) 4937 .k(k) 4938 .iterations(1) 4939 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4940 } 4941 } 4942 } 4943 } 4944 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16)4945 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16) { 4946 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4947 for (uint32_t n = 32; n <= 48; n += 16) { 4948 for (size_t k = 1; k <= 20; k += 5) { 4949 GemmMicrokernelTester() 4950 .mr(1) 4951 .nr(16) 4952 .kr(1) 4953 .sr(1) 4954 .m(1) 4955 .n(n) 4956 .k(k) 4957 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4958 } 4959 } 4960 } 4961 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)4962 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 4963 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4964 for (uint32_t n = 32; n <= 48; n += 16) { 4965 for (size_t k = 1; k <= 20; k += 
5) { 4966 GemmMicrokernelTester() 4967 .mr(1) 4968 .nr(16) 4969 .kr(1) 4970 .sr(1) 4971 .m(1) 4972 .n(n) 4973 .k(k) 4974 .cn_stride(19) 4975 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4976 } 4977 } 4978 } 4979 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_subtile)4980 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 4981 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 4982 for (uint32_t n = 32; n <= 48; n += 16) { 4983 for (size_t k = 1; k <= 20; k += 5) { 4984 for (uint32_t m = 1; m <= 1; m++) { 4985 GemmMicrokernelTester() 4986 .mr(1) 4987 .nr(16) 4988 .kr(1) 4989 .sr(1) 4990 .m(m) 4991 .n(n) 4992 .k(k) 4993 .iterations(1) 4994 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 4995 } 4996 } 4997 } 4998 } 4999 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,small_kernel)5000 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, small_kernel) { 5001 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5002 for (size_t k = 1; k <= 20; k += 5) { 5003 GemmMicrokernelTester() 5004 .mr(1) 5005 .nr(16) 5006 .kr(1) 5007 .sr(1) 5008 .m(1) 5009 .n(16) 5010 .k(k) 5011 .ks(3) 5012 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5013 } 5014 } 5015 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,small_kernel_subtile)5016 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, small_kernel_subtile) { 5017 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5018 for (size_t k = 1; k <= 20; k += 5) { 5019 for (uint32_t n = 1; n <= 16; n++) { 5020 for (uint32_t m = 1; m <= 1; m++) { 5021 GemmMicrokernelTester() 5022 .mr(1) 5023 .nr(16) 5024 .kr(1) 5025 .sr(1) 5026 .m(m) 5027 .n(n) 5028 .k(k) 5029 .ks(3) 5030 .iterations(1) 5031 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5032 } 5033 } 5034 } 5035 } 5036 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_gt_16_small_kernel)5037 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, 
n_gt_16_small_kernel) { 5038 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5039 for (uint32_t n = 17; n < 32; n++) { 5040 for (size_t k = 1; k <= 20; k += 5) { 5041 GemmMicrokernelTester() 5042 .mr(1) 5043 .nr(16) 5044 .kr(1) 5045 .sr(1) 5046 .m(1) 5047 .n(n) 5048 .k(k) 5049 .ks(3) 5050 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5051 } 5052 } 5053 } 5054 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,n_div_16_small_kernel)5055 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) { 5056 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5057 for (uint32_t n = 32; n <= 48; n += 16) { 5058 for (size_t k = 1; k <= 20; k += 5) { 5059 GemmMicrokernelTester() 5060 .mr(1) 5061 .nr(16) 5062 .kr(1) 5063 .sr(1) 5064 .m(1) 5065 .n(n) 5066 .k(k) 5067 .ks(3) 5068 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5069 } 5070 } 5071 } 5072 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cm_subtile)5073 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 5074 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5075 for (size_t k = 1; k <= 20; k += 5) { 5076 for (uint32_t n = 1; n <= 16; n++) { 5077 for (uint32_t m = 1; m <= 1; m++) { 5078 GemmMicrokernelTester() 5079 .mr(1) 5080 .nr(16) 5081 .kr(1) 5082 .sr(1) 5083 .m(m) 5084 .n(n) 5085 .k(k) 5086 .cm_stride(19) 5087 .iterations(1) 5088 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5089 } 5090 } 5091 } 5092 } 5093 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,a_offset)5094 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, a_offset) { 5095 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5096 for (size_t k = 1; k <= 20; k += 5) { 5097 GemmMicrokernelTester() 5098 .mr(1) 5099 .nr(16) 5100 .kr(1) 5101 .sr(1) 5102 .m(1) 5103 .n(16) 5104 .k(k) 5105 .ks(3) 5106 .a_offset(23) 5107 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5108 } 5109 } 5110 
TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,zero)5111 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, zero) { 5112 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5113 for (size_t k = 1; k <= 20; k += 5) { 5114 for (uint32_t mz = 0; mz < 1; mz++) { 5115 GemmMicrokernelTester() 5116 .mr(1) 5117 .nr(16) 5118 .kr(1) 5119 .sr(1) 5120 .m(1) 5121 .n(16) 5122 .k(k) 5123 .ks(3) 5124 .a_offset(23) 5125 .zero_index(mz) 5126 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5127 } 5128 } 5129 } 5130 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,qmin)5131 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmin) { 5132 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5133 GemmMicrokernelTester() 5134 .mr(1) 5135 .nr(16) 5136 .kr(1) 5137 .sr(1) 5138 .m(1) 5139 .n(16) 5140 .k(4) 5141 .qmin(128) 5142 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5143 } 5144 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,qmax)5145 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmax) { 5146 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5147 GemmMicrokernelTester() 5148 .mr(1) 5149 .nr(16) 5150 .kr(1) 5151 .sr(1) 5152 .m(1) 5153 .n(16) 5154 .k(4) 5155 .qmax(128) 5156 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5157 } 5158 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64,strided_cm)5159 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm) { 5160 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5161 GemmMicrokernelTester() 5162 .mr(1) 5163 .nr(16) 5164 .kr(1) 5165 .sr(1) 5166 .m(1) 5167 .n(16) 5168 .k(4) 5169 .cm_stride(19) 5170 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5171 } 5172 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 5173 5174 5175 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4)5176 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4) { 5177 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5178 GemmMicrokernelTester() 5179 .mr(4) 5180 .nr(8) 5181 .kr(1) 5182 .sr(1) 5183 .m(4) 5184 .n(8) 5185 .k(4) 5186 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5187 } 5188 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cn)5189 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cn) { 5190 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5191 GemmMicrokernelTester() 5192 .mr(4) 5193 .nr(8) 5194 .kr(1) 5195 .sr(1) 5196 .m(4) 5197 .n(8) 5198 .k(4) 5199 .cn_stride(11) 5200 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5201 } 5202 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile)5203 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 5204 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5205 for (uint32_t n = 1; n <= 8; n++) { 5206 for (uint32_t m = 1; m <= 4; m++) { 5207 GemmMicrokernelTester() 5208 .mr(4) 5209 .nr(8) 5210 .kr(1) 5211 .sr(1) 5212 .m(m) 5213 .n(n) 5214 .k(4) 5215 .iterations(1) 5216 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5217 } 5218 } 5219 } 5220 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)5221 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 5222 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5223 for (uint32_t m = 1; m <= 4; m++) { 5224 GemmMicrokernelTester() 5225 .mr(4) 5226 .nr(8) 5227 .kr(1) 5228 .sr(1) 5229 .m(m) 5230 .n(8) 5231 .k(4) 5232 .iterations(1) 5233 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5234 } 5235 } 5236 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)5237 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 5238 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5239 for (uint32_t n = 1; n <= 8; n++) { 5240 GemmMicrokernelTester() 5241 .mr(4) 5242 .nr(8) 5243 .kr(1) 5244 .sr(1) 5245 .m(4) 5246 .n(n) 5247 .k(4) 5248 .iterations(1) 5249 
.Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5250 } 5251 } 5252 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4)5253 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4) { 5254 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5255 for (size_t k = 1; k < 4; k++) { 5256 GemmMicrokernelTester() 5257 .mr(4) 5258 .nr(8) 5259 .kr(1) 5260 .sr(1) 5261 .m(4) 5262 .n(8) 5263 .k(k) 5264 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5265 } 5266 } 5267 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_lt_4_subtile)5268 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 5269 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5270 for (size_t k = 1; k < 4; k++) { 5271 for (uint32_t n = 1; n <= 8; n++) { 5272 for (uint32_t m = 1; m <= 4; m++) { 5273 GemmMicrokernelTester() 5274 .mr(4) 5275 .nr(8) 5276 .kr(1) 5277 .sr(1) 5278 .m(m) 5279 .n(n) 5280 .k(k) 5281 .iterations(1) 5282 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5283 } 5284 } 5285 } 5286 } 5287 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4)5288 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4) { 5289 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5290 for (size_t k = 5; k < 8; k++) { 5291 GemmMicrokernelTester() 5292 .mr(4) 5293 .nr(8) 5294 .kr(1) 5295 .sr(1) 5296 .m(4) 5297 .n(8) 5298 .k(k) 5299 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5300 } 5301 } 5302 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_gt_4_subtile)5303 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 5304 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5305 for (size_t k = 5; k < 8; k++) { 5306 for (uint32_t n = 1; n <= 8; n++) { 5307 for (uint32_t m = 1; m <= 4; m++) { 5308 GemmMicrokernelTester() 5309 .mr(4) 5310 .nr(8) 5311 .kr(1) 5312 .sr(1) 5313 .m(m) 5314 .n(n) 5315 .k(k) 5316 .iterations(1) 5317 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 5318 } 5319 } 5320 } 5321 } 5322 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4)5323 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4) { 5324 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5325 for (size_t k = 8; k <= 40; k += 4) { 5326 GemmMicrokernelTester() 5327 .mr(4) 5328 .nr(8) 5329 .kr(1) 5330 .sr(1) 5331 .m(4) 5332 .n(8) 5333 .k(k) 5334 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5335 } 5336 } 5337 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,k_div_4_subtile)5338 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 5339 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5340 for (size_t k = 8; k <= 40; k += 4) { 5341 for (uint32_t n = 1; n <= 8; n++) { 5342 for (uint32_t m = 1; m <= 4; m++) { 5343 GemmMicrokernelTester() 5344 .mr(4) 5345 .nr(8) 5346 .kr(1) 5347 .sr(1) 5348 .m(m) 5349 .n(n) 5350 .k(k) 5351 .iterations(1) 5352 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5353 } 5354 } 5355 } 5356 } 5357 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8)5358 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8) { 5359 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5360 for (uint32_t n = 9; n < 16; n++) { 5361 for (size_t k = 1; k <= 20; k += 5) { 5362 GemmMicrokernelTester() 5363 .mr(4) 5364 .nr(8) 5365 .kr(1) 5366 .sr(1) 5367 .m(4) 5368 .n(n) 5369 .k(k) 5370 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5371 } 5372 } 5373 } 5374 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)5375 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 5376 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5377 for (uint32_t n = 9; n < 16; n++) { 5378 for (size_t k = 1; k <= 20; k += 5) { 5379 GemmMicrokernelTester() 5380 .mr(4) 5381 .nr(8) 5382 .kr(1) 5383 .sr(1) 5384 .m(4) 5385 .n(n) 5386 .k(k) 5387 .cn_stride(11) 5388 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 5389 } 5390 } 5391 } 5392 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_subtile)5393 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 5394 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5395 for (uint32_t n = 9; n < 16; n++) { 5396 for (size_t k = 1; k <= 20; k += 5) { 5397 for (uint32_t m = 1; m <= 4; m++) { 5398 GemmMicrokernelTester() 5399 .mr(4) 5400 .nr(8) 5401 .kr(1) 5402 .sr(1) 5403 .m(m) 5404 .n(n) 5405 .k(k) 5406 .iterations(1) 5407 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5408 } 5409 } 5410 } 5411 } 5412 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8)5413 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8) { 5414 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5415 for (uint32_t n = 16; n <= 24; n += 8) { 5416 for (size_t k = 1; k <= 20; k += 5) { 5417 GemmMicrokernelTester() 5418 .mr(4) 5419 .nr(8) 5420 .kr(1) 5421 .sr(1) 5422 .m(4) 5423 .n(n) 5424 .k(k) 5425 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5426 } 5427 } 5428 } 5429 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)5430 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 5431 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5432 for (uint32_t n = 16; n <= 24; n += 8) { 5433 for (size_t k = 1; k <= 20; k += 5) { 5434 GemmMicrokernelTester() 5435 .mr(4) 5436 .nr(8) 5437 .kr(1) 5438 .sr(1) 5439 .m(4) 5440 .n(n) 5441 .k(k) 5442 .cn_stride(11) 5443 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5444 } 5445 } 5446 } 5447 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_subtile)5448 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 5449 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5450 for (uint32_t n = 16; n <= 24; n += 8) { 5451 for (size_t k = 1; k <= 20; k += 5) { 5452 for (uint32_t m = 1; m <= 4; m++) { 5453 GemmMicrokernelTester() 5454 .mr(4) 5455 .nr(8) 5456 .kr(1) 5457 
.sr(1) 5458 .m(m) 5459 .n(n) 5460 .k(k) 5461 .iterations(1) 5462 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5463 } 5464 } 5465 } 5466 } 5467 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,small_kernel)5468 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, small_kernel) { 5469 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5470 for (size_t k = 1; k <= 20; k += 5) { 5471 GemmMicrokernelTester() 5472 .mr(4) 5473 .nr(8) 5474 .kr(1) 5475 .sr(1) 5476 .m(4) 5477 .n(8) 5478 .k(k) 5479 .ks(3) 5480 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5481 } 5482 } 5483 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,small_kernel_subtile)5484 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, small_kernel_subtile) { 5485 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5486 for (size_t k = 1; k <= 20; k += 5) { 5487 for (uint32_t n = 1; n <= 8; n++) { 5488 for (uint32_t m = 1; m <= 4; m++) { 5489 GemmMicrokernelTester() 5490 .mr(4) 5491 .nr(8) 5492 .kr(1) 5493 .sr(1) 5494 .m(m) 5495 .n(n) 5496 .k(k) 5497 .ks(3) 5498 .iterations(1) 5499 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5500 } 5501 } 5502 } 5503 } 5504 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_gt_8_small_kernel)5505 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) { 5506 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5507 for (uint32_t n = 9; n < 16; n++) { 5508 for (size_t k = 1; k <= 20; k += 5) { 5509 GemmMicrokernelTester() 5510 .mr(4) 5511 .nr(8) 5512 .kr(1) 5513 .sr(1) 5514 .m(4) 5515 .n(n) 5516 .k(k) 5517 .ks(3) 5518 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5519 } 5520 } 5521 } 5522 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,n_div_8_small_kernel)5523 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) { 5524 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5525 for (uint32_t n = 16; n <= 24; n += 8) { 5526 for (size_t k = 1; k <= 20; k 
+= 5) { 5527 GemmMicrokernelTester() 5528 .mr(4) 5529 .nr(8) 5530 .kr(1) 5531 .sr(1) 5532 .m(4) 5533 .n(n) 5534 .k(k) 5535 .ks(3) 5536 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5537 } 5538 } 5539 } 5540 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cm_subtile)5541 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 5542 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5543 for (size_t k = 1; k <= 20; k += 5) { 5544 for (uint32_t n = 1; n <= 8; n++) { 5545 for (uint32_t m = 1; m <= 4; m++) { 5546 GemmMicrokernelTester() 5547 .mr(4) 5548 .nr(8) 5549 .kr(1) 5550 .sr(1) 5551 .m(m) 5552 .n(n) 5553 .k(k) 5554 .cm_stride(11) 5555 .iterations(1) 5556 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5557 } 5558 } 5559 } 5560 } 5561 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,a_offset)5562 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, a_offset) { 5563 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5564 for (size_t k = 1; k <= 20; k += 5) { 5565 GemmMicrokernelTester() 5566 .mr(4) 5567 .nr(8) 5568 .kr(1) 5569 .sr(1) 5570 .m(4) 5571 .n(8) 5572 .k(k) 5573 .ks(3) 5574 .a_offset(83) 5575 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5576 } 5577 } 5578 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,zero)5579 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, zero) { 5580 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5581 for (size_t k = 1; k <= 20; k += 5) { 5582 for (uint32_t mz = 0; mz < 4; mz++) { 5583 GemmMicrokernelTester() 5584 .mr(4) 5585 .nr(8) 5586 .kr(1) 5587 .sr(1) 5588 .m(4) 5589 .n(8) 5590 .k(k) 5591 .ks(3) 5592 .a_offset(83) 5593 .zero_index(mz) 5594 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5595 } 5596 } 5597 } 5598 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,qmin)5599 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmin) { 5600 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5601 
GemmMicrokernelTester() 5602 .mr(4) 5603 .nr(8) 5604 .kr(1) 5605 .sr(1) 5606 .m(4) 5607 .n(8) 5608 .k(4) 5609 .qmin(128) 5610 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5611 } 5612 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,qmax)5613 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmax) { 5614 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5615 GemmMicrokernelTester() 5616 .mr(4) 5617 .nr(8) 5618 .kr(1) 5619 .sr(1) 5620 .m(4) 5621 .n(8) 5622 .k(4) 5623 .qmax(128) 5624 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5625 } 5626 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64,strided_cm)5627 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm) { 5628 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5629 GemmMicrokernelTester() 5630 .mr(4) 5631 .nr(8) 5632 .kr(1) 5633 .sr(1) 5634 .m(4) 5635 .n(8) 5636 .k(4) 5637 .cm_stride(11) 5638 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5639 } 5640 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 5641 5642 5643 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,k_eq_4)5644 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4) { 5645 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5646 GemmMicrokernelTester() 5647 .mr(4) 5648 .nr(16) 5649 .kr(1) 5650 .sr(1) 5651 .m(4) 5652 .n(16) 5653 .k(4) 5654 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5655 } 5656 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cn)5657 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cn) { 5658 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5659 GemmMicrokernelTester() 5660 .mr(4) 5661 .nr(16) 5662 .kr(1) 5663 .sr(1) 5664 .m(4) 5665 .n(16) 5666 .k(4) 5667 .cn_stride(19) 5668 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5669 } 5670 
TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k = 4; n in 1..16 and m in 1..4, one iteration each.
  for (uint32_t nv = 1; nv <= 16; ++nv) {
    for (uint32_t mv = 1; mv <= 4; ++mv) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(mv).n(nv).k(4)
        .iterations(1)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k = 4, n = 16; m in 1..4, one iteration each.
  for (uint32_t mv = 1; mv <= 4; ++mv) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(mv).n(16).k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k = 4, m = 4; n in 1..16, one iteration each.
  for (uint32_t nv = 1; nv <= 16; ++nv) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(nv).k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 4, n = 16; k in 1..3.
  for (size_t kv = 1; kv < 4; ++kv) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kv)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k in 1..3, n in 1..16, m in 1..4, one iteration each.
  for (size_t kv = 1; kv < 4; ++kv) {
    for (uint32_t nv = 1; nv <= 16; ++nv) {
      for (uint32_t mv = 1; mv <= 4; ++mv) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 4, n = 16; k in 5..7.
  for (size_t kv = 5; kv < 8; ++kv) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kv)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k in 5..7, n in 1..16, m in 1..4, one iteration each.
  for (size_t kv = 5; kv < 8; ++kv) {
    for (uint32_t nv = 1; nv <= 16; ++nv) {
      for (uint32_t mv = 1; mv <= 4; ++mv) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 4, n = 16; k in 8..40 in steps of 4.
  for (size_t kv = 8; kv <= 40; kv += 4) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kv)
      .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k in 8..40 in steps of 4, n in 1..16, m in 1..4, one iteration each.
  for (size_t kv = 8; kv <= 40; kv += 4) {
    for (uint32_t nv = 1; nv <= 16; ++nv) {
      for (uint32_t mv = 1; mv <= 4; ++mv) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 4; n in 17..31, k in {1, 6, 11, 16}.
  for (uint32_t nv = 17; nv < 32; ++nv) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(nv).k(kv)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 4; n in 17..31, k in {1, 6, 11, 16}, cn_stride of 19.
  for (uint32_t nv = 17; nv < 32; ++nv) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(nv).k(kv)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // n in 17..31, k in {1, 6, 11, 16}, m in 1..4, one iteration each.
  for (uint32_t nv = 17; nv < 32; ++nv) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      for (uint32_t mv = 1; mv <= 4; ++mv) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}
TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16)5881 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16) { 5882 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5883 for (uint32_t n = 32; n <= 48; n += 16) { 5884 for (size_t k = 1; k <= 20; k += 5) { 5885 GemmMicrokernelTester() 5886 .mr(4) 5887 .nr(16) 5888 .kr(1) 5889 .sr(1) 5890 .m(4) 5891 .n(n) 5892 .k(k) 5893 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5894 } 5895 } 5896 } 5897 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)5898 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 5899 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5900 for (uint32_t n = 32; n <= 48; n += 16) { 5901 for (size_t k = 1; k <= 20; k += 5) { 5902 GemmMicrokernelTester() 5903 .mr(4) 5904 .nr(16) 5905 .kr(1) 5906 .sr(1) 5907 .m(4) 5908 .n(n) 5909 .k(k) 5910 .cn_stride(19) 5911 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5912 } 5913 } 5914 } 5915 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_subtile)5916 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 5917 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5918 for (uint32_t n = 32; n <= 48; n += 16) { 5919 for (size_t k = 1; k <= 20; k += 5) { 5920 for (uint32_t m = 1; m <= 4; m++) { 5921 GemmMicrokernelTester() 5922 .mr(4) 5923 .nr(16) 5924 .kr(1) 5925 .sr(1) 5926 .m(m) 5927 .n(n) 5928 .k(k) 5929 .iterations(1) 5930 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5931 } 5932 } 5933 } 5934 } 5935 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,small_kernel)5936 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, small_kernel) { 5937 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5938 for (size_t k = 1; k <= 20; k += 5) { 5939 GemmMicrokernelTester() 5940 .mr(4) 5941 .nr(16) 5942 .kr(1) 5943 .sr(1) 5944 .m(4) 5945 .n(16) 5946 .k(k) 5947 .ks(3) 5948 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 5949 } 5950 } 5951 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,small_kernel_subtile)5952 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, small_kernel_subtile) { 5953 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5954 for (size_t k = 1; k <= 20; k += 5) { 5955 for (uint32_t n = 1; n <= 16; n++) { 5956 for (uint32_t m = 1; m <= 4; m++) { 5957 GemmMicrokernelTester() 5958 .mr(4) 5959 .nr(16) 5960 .kr(1) 5961 .sr(1) 5962 .m(m) 5963 .n(n) 5964 .k(k) 5965 .ks(3) 5966 .iterations(1) 5967 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5968 } 5969 } 5970 } 5971 } 5972 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_gt_16_small_kernel)5973 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) { 5974 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5975 for (uint32_t n = 17; n < 32; n++) { 5976 for (size_t k = 1; k <= 20; k += 5) { 5977 GemmMicrokernelTester() 5978 .mr(4) 5979 .nr(16) 5980 .kr(1) 5981 .sr(1) 5982 .m(4) 5983 .n(n) 5984 .k(k) 5985 .ks(3) 5986 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 5987 } 5988 } 5989 } 5990 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,n_div_16_small_kernel)5991 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) { 5992 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 5993 for (uint32_t n = 32; n <= 48; n += 16) { 5994 for (size_t k = 1; k <= 20; k += 5) { 5995 GemmMicrokernelTester() 5996 .mr(4) 5997 .nr(16) 5998 .kr(1) 5999 .sr(1) 6000 .m(4) 6001 .n(n) 6002 .k(k) 6003 .ks(3) 6004 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6005 } 6006 } 6007 } 6008 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cm_subtile)6009 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 6010 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6011 for (size_t k = 1; k <= 20; k += 5) { 6012 for (uint32_t n = 1; n <= 16; n++) { 6013 for (uint32_t m = 1; m <= 4; m++) { 6014 
GemmMicrokernelTester() 6015 .mr(4) 6016 .nr(16) 6017 .kr(1) 6018 .sr(1) 6019 .m(m) 6020 .n(n) 6021 .k(k) 6022 .cm_stride(19) 6023 .iterations(1) 6024 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6025 } 6026 } 6027 } 6028 } 6029 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,a_offset)6030 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, a_offset) { 6031 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6032 for (size_t k = 1; k <= 20; k += 5) { 6033 GemmMicrokernelTester() 6034 .mr(4) 6035 .nr(16) 6036 .kr(1) 6037 .sr(1) 6038 .m(4) 6039 .n(16) 6040 .k(k) 6041 .ks(3) 6042 .a_offset(83) 6043 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6044 } 6045 } 6046 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,zero)6047 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, zero) { 6048 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6049 for (size_t k = 1; k <= 20; k += 5) { 6050 for (uint32_t mz = 0; mz < 4; mz++) { 6051 GemmMicrokernelTester() 6052 .mr(4) 6053 .nr(16) 6054 .kr(1) 6055 .sr(1) 6056 .m(4) 6057 .n(16) 6058 .k(k) 6059 .ks(3) 6060 .a_offset(83) 6061 .zero_index(mz) 6062 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6063 } 6064 } 6065 } 6066 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,qmin)6067 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmin) { 6068 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6069 GemmMicrokernelTester() 6070 .mr(4) 6071 .nr(16) 6072 .kr(1) 6073 .sr(1) 6074 .m(4) 6075 .n(16) 6076 .k(4) 6077 .qmin(128) 6078 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6079 } 6080 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,qmax)6081 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmax) { 6082 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6083 GemmMicrokernelTester() 6084 .mr(4) 6085 .nr(16) 6086 .kr(1) 6087 .sr(1) 6088 .m(4) 6089 .n(16) 6090 .k(4) 6091 .qmax(128) 6092 
.Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6093 } 6094 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64,strided_cm)6095 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm) { 6096 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6097 GemmMicrokernelTester() 6098 .mr(4) 6099 .nr(16) 6100 .kr(1) 6101 .sr(1) 6102 .m(4) 6103 .n(16) 6104 .k(4) 6105 .cm_stride(19) 6106 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6107 } 6108 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 6109 6110 6111 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4)6112 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4) { 6113 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6114 GemmMicrokernelTester() 6115 .mr(6) 6116 .nr(8) 6117 .kr(1) 6118 .sr(1) 6119 .m(6) 6120 .n(8) 6121 .k(4) 6122 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6123 } 6124 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cn)6125 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cn) { 6126 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6127 GemmMicrokernelTester() 6128 .mr(6) 6129 .nr(8) 6130 .kr(1) 6131 .sr(1) 6132 .m(6) 6133 .n(8) 6134 .k(4) 6135 .cn_stride(11) 6136 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6137 } 6138 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,k_eq_4_subtile)6139 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 6140 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6141 for (uint32_t n = 1; n <= 8; n++) { 6142 for (uint32_t m = 1; m <= 6; m++) { 6143 GemmMicrokernelTester() 6144 .mr(6) 6145 .nr(8) 6146 .kr(1) 6147 .sr(1) 6148 .m(m) 6149 .n(n) 6150 .k(4) 6151 .iterations(1) 6152 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6153 } 6154 } 6155 } 6156 
TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k = 4, n = 8; m in 1..6, one iteration each.
  for (uint32_t mv = 1; mv <= 6; ++mv) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(mv).n(8).k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k = 4, m = 6; n in 1..8, one iteration each.
  for (uint32_t nv = 1; nv <= 8; ++nv) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(nv).k(4)
      .iterations(1)
      .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 6, n = 8; k in 1..3.
  for (size_t kv = 1; kv < 4; ++kv) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kv)
      .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k in 1..3, n in 1..8, m in 1..6, one iteration each.
  for (size_t kv = 1; kv < 4; ++kv) {
    for (uint32_t nv = 1; nv <= 8; ++nv) {
      for (uint32_t mv = 1; mv <= 6; ++mv) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 6, n = 8; k in 5..7.
  for (size_t kv = 5; kv < 8; ++kv) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kv)
      .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k in 5..7, n in 1..8, m in 1..6, one iteration each.
  for (size_t kv = 5; kv < 8; ++kv) {
    for (uint32_t nv = 1; nv <= 8; ++nv) {
      for (uint32_t mv = 1; mv <= 6; ++mv) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 6, n = 8; k in 8..40 in steps of 4.
  for (size_t kv = 8; kv <= 40; kv += 4) {
    GemmMicrokernelTester()
      .mr(6).nr(8).kr(1).sr(1)
      .m(6).n(8).k(kv)
      .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // k in 8..40 in steps of 4, n in 1..8, m in 1..6, one iteration each.
  for (size_t kv = 8; kv <= 40; kv += 4) {
    for (uint32_t nv = 1; nv <= 8; ++nv) {
      for (uint32_t mv = 1; mv <= 6; ++mv) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 6; n in 9..15, k in {1, 6, 11, 16}.
  for (uint32_t nv = 9; nv < 16; ++nv) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nv).k(kv)
        .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 6; n in 9..15, k in {1, 6, 11, 16}, cn_stride of 11.
  for (uint32_t nv = 9; nv < 16; ++nv) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nv).k(kv)
        .cn_stride(11)
        .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // n in 9..15, k in {1, 6, 11, 16}, m in 1..6, one iteration each.
  for (uint32_t nv = 9; nv < 16; ++nv) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      for (uint32_t mv = 1; mv <= 6; ++mv) {
        GemmMicrokernelTester()
          .mr(6).nr(8).kr(1).sr(1)
          .m(mv).n(nv).k(kv)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  // m = 6; n in {16, 24}, k in {1, 6, 11, 16}.
  for (uint32_t nv = 16; nv <= 24; nv += 8) {
    for (size_t kv = 1; kv <= 20; kv += 5) {
      GemmMicrokernelTester()
        .mr(6).nr(8).kr(1).sr(1)
        .m(6).n(nv).k(kv)
        .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
    }
  }
}
TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)6366 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 6367 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6368 for (uint32_t n = 16; n <= 24; n += 8) { 6369 for (size_t k = 1; k <= 20; k += 5) { 6370 GemmMicrokernelTester() 6371 .mr(6) 6372 .nr(8) 6373 .kr(1) 6374 .sr(1) 6375 .m(6) 6376 .n(n) 6377 .k(k) 6378 .cn_stride(11) 6379 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6380 } 6381 } 6382 } 6383 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_subtile)6384 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 6385 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6386 for (uint32_t n = 16; n <= 24; n += 8) { 6387 for (size_t k = 1; k <= 20; k += 5) { 6388 for (uint32_t m = 1; m <= 6; m++) { 6389 GemmMicrokernelTester() 6390 .mr(6) 6391 .nr(8) 6392 .kr(1) 6393 .sr(1) 6394 .m(m) 6395 .n(n) 6396 .k(k) 6397 .iterations(1) 6398 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6399 } 6400 } 6401 } 6402 } 6403 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,small_kernel)6404 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, small_kernel) { 6405 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6406 for (size_t k = 1; k <= 20; k += 5) { 6407 GemmMicrokernelTester() 6408 .mr(6) 6409 .nr(8) 6410 .kr(1) 6411 .sr(1) 6412 .m(6) 6413 .n(8) 6414 .k(k) 6415 .ks(3) 6416 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6417 } 6418 } 6419 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,small_kernel_subtile)6420 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, small_kernel_subtile) { 6421 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6422 for (size_t k = 1; k <= 20; k += 5) { 6423 for (uint32_t n = 1; n <= 8; n++) { 6424 for (uint32_t m = 1; m <= 6; m++) { 6425 GemmMicrokernelTester() 6426 .mr(6) 6427 .nr(8) 6428 .kr(1) 6429 .sr(1) 6430 .m(m) 6431 .n(n) 6432 .k(k) 6433 .ks(3) 6434 .iterations(1) 6435 
.Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6436 } 6437 } 6438 } 6439 } 6440 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_gt_8_small_kernel)6441 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) { 6442 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6443 for (uint32_t n = 9; n < 16; n++) { 6444 for (size_t k = 1; k <= 20; k += 5) { 6445 GemmMicrokernelTester() 6446 .mr(6) 6447 .nr(8) 6448 .kr(1) 6449 .sr(1) 6450 .m(6) 6451 .n(n) 6452 .k(k) 6453 .ks(3) 6454 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6455 } 6456 } 6457 } 6458 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,n_div_8_small_kernel)6459 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) { 6460 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6461 for (uint32_t n = 16; n <= 24; n += 8) { 6462 for (size_t k = 1; k <= 20; k += 5) { 6463 GemmMicrokernelTester() 6464 .mr(6) 6465 .nr(8) 6466 .kr(1) 6467 .sr(1) 6468 .m(6) 6469 .n(n) 6470 .k(k) 6471 .ks(3) 6472 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6473 } 6474 } 6475 } 6476 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cm_subtile)6477 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 6478 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6479 for (size_t k = 1; k <= 20; k += 5) { 6480 for (uint32_t n = 1; n <= 8; n++) { 6481 for (uint32_t m = 1; m <= 6; m++) { 6482 GemmMicrokernelTester() 6483 .mr(6) 6484 .nr(8) 6485 .kr(1) 6486 .sr(1) 6487 .m(m) 6488 .n(n) 6489 .k(k) 6490 .cm_stride(11) 6491 .iterations(1) 6492 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6493 } 6494 } 6495 } 6496 } 6497 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,a_offset)6498 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, a_offset) { 6499 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6500 for (size_t k = 1; k <= 20; k += 5) { 6501 GemmMicrokernelTester() 6502 .mr(6) 6503 .nr(8) 
6504 .kr(1) 6505 .sr(1) 6506 .m(6) 6507 .n(8) 6508 .k(k) 6509 .ks(3) 6510 .a_offset(127) 6511 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6512 } 6513 } 6514 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,zero)6515 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, zero) { 6516 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6517 for (size_t k = 1; k <= 20; k += 5) { 6518 for (uint32_t mz = 0; mz < 6; mz++) { 6519 GemmMicrokernelTester() 6520 .mr(6) 6521 .nr(8) 6522 .kr(1) 6523 .sr(1) 6524 .m(6) 6525 .n(8) 6526 .k(k) 6527 .ks(3) 6528 .a_offset(127) 6529 .zero_index(mz) 6530 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6531 } 6532 } 6533 } 6534 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,qmin)6535 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmin) { 6536 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6537 GemmMicrokernelTester() 6538 .mr(6) 6539 .nr(8) 6540 .kr(1) 6541 .sr(1) 6542 .m(6) 6543 .n(8) 6544 .k(4) 6545 .qmin(128) 6546 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6547 } 6548 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,qmax)6549 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmax) { 6550 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6551 GemmMicrokernelTester() 6552 .mr(6) 6553 .nr(8) 6554 .kr(1) 6555 .sr(1) 6556 .m(6) 6557 .n(8) 6558 .k(4) 6559 .qmax(128) 6560 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6561 } 6562 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64,strided_cm)6563 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm) { 6564 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6565 GemmMicrokernelTester() 6566 .mr(6) 6567 .nr(8) 6568 .kr(1) 6569 .sr(1) 6570 .m(6) 6571 .n(8) 6572 .k(4) 6573 .cm_stride(11) 6574 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6575 } 6576 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 6577 6578 6579 #if 
XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4)6580 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4) { 6581 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6582 GemmMicrokernelTester() 6583 .mr(6) 6584 .nr(16) 6585 .kr(1) 6586 .sr(1) 6587 .m(6) 6588 .n(16) 6589 .k(4) 6590 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6591 } 6592 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cn)6593 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cn) { 6594 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6595 GemmMicrokernelTester() 6596 .mr(6) 6597 .nr(16) 6598 .kr(1) 6599 .sr(1) 6600 .m(6) 6601 .n(16) 6602 .k(4) 6603 .cn_stride(19) 6604 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6605 } 6606 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile)6607 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 6608 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6609 for (uint32_t n = 1; n <= 16; n++) { 6610 for (uint32_t m = 1; m <= 6; m++) { 6611 GemmMicrokernelTester() 6612 .mr(6) 6613 .nr(16) 6614 .kr(1) 6615 .sr(1) 6616 .m(m) 6617 .n(n) 6618 .k(4) 6619 .iterations(1) 6620 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6621 } 6622 } 6623 } 6624 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)6625 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 6626 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6627 for (uint32_t m = 1; m <= 6; m++) { 6628 GemmMicrokernelTester() 6629 .mr(6) 6630 .nr(16) 6631 .kr(1) 6632 .sr(1) 6633 .m(m) 6634 .n(16) 6635 .k(4) 6636 .iterations(1) 6637 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6638 } 6639 } 6640 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)6641 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 6642 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6643 for (uint32_t n = 1; n <= 16; n++) { 6644 GemmMicrokernelTester() 6645 .mr(6) 6646 .nr(16) 6647 .kr(1) 6648 .sr(1) 6649 .m(6) 6650 .n(n) 6651 .k(4) 6652 .iterations(1) 6653 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6654 } 6655 } 6656 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4)6657 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4) { 6658 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6659 for (size_t k = 1; k < 4; k++) { 6660 GemmMicrokernelTester() 6661 .mr(6) 6662 .nr(16) 6663 .kr(1) 6664 .sr(1) 6665 .m(6) 6666 .n(16) 6667 .k(k) 6668 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6669 } 6670 } 6671 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_lt_4_subtile)6672 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 6673 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6674 for (size_t k = 1; k < 4; k++) { 6675 for (uint32_t n = 1; n <= 16; n++) { 6676 for (uint32_t m = 1; m <= 6; m++) { 6677 GemmMicrokernelTester() 6678 .mr(6) 6679 .nr(16) 6680 .kr(1) 6681 .sr(1) 6682 .m(m) 6683 .n(n) 6684 .k(k) 6685 .iterations(1) 6686 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6687 } 6688 } 6689 } 6690 } 6691 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4)6692 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4) { 6693 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6694 for (size_t k = 5; k < 8; k++) { 6695 GemmMicrokernelTester() 6696 .mr(6) 6697 .nr(16) 6698 .kr(1) 6699 .sr(1) 6700 .m(6) 6701 .n(16) 6702 .k(k) 6703 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6704 } 6705 } 6706 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_gt_4_subtile)6707 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_subtile) { 6708 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6709 for (size_t k = 5; k < 8; k++) { 6710 for (uint32_t n = 1; n <= 16; n++) { 6711 for 
(uint32_t m = 1; m <= 6; m++) { 6712 GemmMicrokernelTester() 6713 .mr(6) 6714 .nr(16) 6715 .kr(1) 6716 .sr(1) 6717 .m(m) 6718 .n(n) 6719 .k(k) 6720 .iterations(1) 6721 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6722 } 6723 } 6724 } 6725 } 6726 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4)6727 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4) { 6728 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6729 for (size_t k = 8; k <= 40; k += 4) { 6730 GemmMicrokernelTester() 6731 .mr(6) 6732 .nr(16) 6733 .kr(1) 6734 .sr(1) 6735 .m(6) 6736 .n(16) 6737 .k(k) 6738 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6739 } 6740 } 6741 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,k_div_4_subtile)6742 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_subtile) { 6743 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6744 for (size_t k = 8; k <= 40; k += 4) { 6745 for (uint32_t n = 1; n <= 16; n++) { 6746 for (uint32_t m = 1; m <= 6; m++) { 6747 GemmMicrokernelTester() 6748 .mr(6) 6749 .nr(16) 6750 .kr(1) 6751 .sr(1) 6752 .m(m) 6753 .n(n) 6754 .k(k) 6755 .iterations(1) 6756 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6757 } 6758 } 6759 } 6760 } 6761 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16)6762 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16) { 6763 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6764 for (uint32_t n = 17; n < 32; n++) { 6765 for (size_t k = 1; k <= 20; k += 5) { 6766 GemmMicrokernelTester() 6767 .mr(6) 6768 .nr(16) 6769 .kr(1) 6770 .sr(1) 6771 .m(6) 6772 .n(n) 6773 .k(k) 6774 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6775 } 6776 } 6777 } 6778 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)6779 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 6780 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6781 for (uint32_t n = 17; n < 32; n++) { 
6782 for (size_t k = 1; k <= 20; k += 5) { 6783 GemmMicrokernelTester() 6784 .mr(6) 6785 .nr(16) 6786 .kr(1) 6787 .sr(1) 6788 .m(6) 6789 .n(n) 6790 .k(k) 6791 .cn_stride(19) 6792 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6793 } 6794 } 6795 } 6796 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_subtile)6797 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 6798 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6799 for (uint32_t n = 17; n < 32; n++) { 6800 for (size_t k = 1; k <= 20; k += 5) { 6801 for (uint32_t m = 1; m <= 6; m++) { 6802 GemmMicrokernelTester() 6803 .mr(6) 6804 .nr(16) 6805 .kr(1) 6806 .sr(1) 6807 .m(m) 6808 .n(n) 6809 .k(k) 6810 .iterations(1) 6811 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6812 } 6813 } 6814 } 6815 } 6816 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16)6817 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16) { 6818 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6819 for (uint32_t n = 32; n <= 48; n += 16) { 6820 for (size_t k = 1; k <= 20; k += 5) { 6821 GemmMicrokernelTester() 6822 .mr(6) 6823 .nr(16) 6824 .kr(1) 6825 .sr(1) 6826 .m(6) 6827 .n(n) 6828 .k(k) 6829 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6830 } 6831 } 6832 } 6833 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)6834 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 6835 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6836 for (uint32_t n = 32; n <= 48; n += 16) { 6837 for (size_t k = 1; k <= 20; k += 5) { 6838 GemmMicrokernelTester() 6839 .mr(6) 6840 .nr(16) 6841 .kr(1) 6842 .sr(1) 6843 .m(6) 6844 .n(n) 6845 .k(k) 6846 .cn_stride(19) 6847 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6848 } 6849 } 6850 } 6851 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_subtile)6852 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, 
n_div_16_subtile) { 6853 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6854 for (uint32_t n = 32; n <= 48; n += 16) { 6855 for (size_t k = 1; k <= 20; k += 5) { 6856 for (uint32_t m = 1; m <= 6; m++) { 6857 GemmMicrokernelTester() 6858 .mr(6) 6859 .nr(16) 6860 .kr(1) 6861 .sr(1) 6862 .m(m) 6863 .n(n) 6864 .k(k) 6865 .iterations(1) 6866 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6867 } 6868 } 6869 } 6870 } 6871 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,small_kernel)6872 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, small_kernel) { 6873 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6874 for (size_t k = 1; k <= 20; k += 5) { 6875 GemmMicrokernelTester() 6876 .mr(6) 6877 .nr(16) 6878 .kr(1) 6879 .sr(1) 6880 .m(6) 6881 .n(16) 6882 .k(k) 6883 .ks(3) 6884 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6885 } 6886 } 6887 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,small_kernel_subtile)6888 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, small_kernel_subtile) { 6889 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6890 for (size_t k = 1; k <= 20; k += 5) { 6891 for (uint32_t n = 1; n <= 16; n++) { 6892 for (uint32_t m = 1; m <= 6; m++) { 6893 GemmMicrokernelTester() 6894 .mr(6) 6895 .nr(16) 6896 .kr(1) 6897 .sr(1) 6898 .m(m) 6899 .n(n) 6900 .k(k) 6901 .ks(3) 6902 .iterations(1) 6903 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6904 } 6905 } 6906 } 6907 } 6908 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_gt_16_small_kernel)6909 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) { 6910 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6911 for (uint32_t n = 17; n < 32; n++) { 6912 for (size_t k = 1; k <= 20; k += 5) { 6913 GemmMicrokernelTester() 6914 .mr(6) 6915 .nr(16) 6916 .kr(1) 6917 .sr(1) 6918 .m(6) 6919 .n(n) 6920 .k(k) 6921 .ks(3) 6922 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6923 } 
6924 } 6925 } 6926 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,n_div_16_small_kernel)6927 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) { 6928 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6929 for (uint32_t n = 32; n <= 48; n += 16) { 6930 for (size_t k = 1; k <= 20; k += 5) { 6931 GemmMicrokernelTester() 6932 .mr(6) 6933 .nr(16) 6934 .kr(1) 6935 .sr(1) 6936 .m(6) 6937 .n(n) 6938 .k(k) 6939 .ks(3) 6940 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6941 } 6942 } 6943 } 6944 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cm_subtile)6945 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 6946 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6947 for (size_t k = 1; k <= 20; k += 5) { 6948 for (uint32_t n = 1; n <= 16; n++) { 6949 for (uint32_t m = 1; m <= 6; m++) { 6950 GemmMicrokernelTester() 6951 .mr(6) 6952 .nr(16) 6953 .kr(1) 6954 .sr(1) 6955 .m(m) 6956 .n(n) 6957 .k(k) 6958 .cm_stride(19) 6959 .iterations(1) 6960 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6961 } 6962 } 6963 } 6964 } 6965 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,a_offset)6966 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, a_offset) { 6967 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6968 for (size_t k = 1; k <= 20; k += 5) { 6969 GemmMicrokernelTester() 6970 .mr(6) 6971 .nr(16) 6972 .kr(1) 6973 .sr(1) 6974 .m(6) 6975 .n(16) 6976 .k(k) 6977 .ks(3) 6978 .a_offset(127) 6979 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6980 } 6981 } 6982 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,zero)6983 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, zero) { 6984 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 6985 for (size_t k = 1; k <= 20; k += 5) { 6986 for (uint32_t mz = 0; mz < 6; mz++) { 6987 GemmMicrokernelTester() 6988 .mr(6) 6989 .nr(16) 6990 .kr(1) 6991 .sr(1) 6992 .m(6) 6993 .n(16) 6994 .k(k) 6995 .ks(3) 6996 .a_offset(127) 6997 
.zero_index(mz) 6998 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 6999 } 7000 } 7001 } 7002 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,qmin)7003 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmin) { 7004 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7005 GemmMicrokernelTester() 7006 .mr(6) 7007 .nr(16) 7008 .kr(1) 7009 .sr(1) 7010 .m(6) 7011 .n(16) 7012 .k(4) 7013 .qmin(128) 7014 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7015 } 7016 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,qmax)7017 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmax) { 7018 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7019 GemmMicrokernelTester() 7020 .mr(6) 7021 .nr(16) 7022 .kr(1) 7023 .sr(1) 7024 .m(6) 7025 .n(16) 7026 .k(4) 7027 .qmax(128) 7028 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7029 } 7030 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64,strided_cm)7031 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm) { 7032 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7033 GemmMicrokernelTester() 7034 .mr(6) 7035 .nr(16) 7036 .kr(1) 7037 .sr(1) 7038 .m(6) 7039 .n(16) 7040 .k(4) 7041 .cm_stride(19) 7042 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7043 } 7044 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7045 7046 7047 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4)7048 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4) { 7049 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7050 GemmMicrokernelTester() 7051 .mr(8) 7052 .nr(8) 7053 .kr(1) 7054 .sr(1) 7055 .m(8) 7056 .n(8) 7057 .k(4) 7058 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7059 } 7060 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cn)7061 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cn) { 7062 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7063 GemmMicrokernelTester() 7064 .mr(8) 7065 .nr(8) 7066 .kr(1) 7067 .sr(1) 7068 .m(8) 7069 .n(8) 7070 .k(4) 7071 .cn_stride(11) 7072 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7073 } 7074 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile)7075 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile) { 7076 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7077 for (uint32_t n = 1; n <= 8; n++) { 7078 for (uint32_t m = 1; m <= 8; m++) { 7079 GemmMicrokernelTester() 7080 .mr(8) 7081 .nr(8) 7082 .kr(1) 7083 .sr(1) 7084 .m(m) 7085 .n(n) 7086 .k(4) 7087 .iterations(1) 7088 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7089 } 7090 } 7091 } 7092 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile_m)7093 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 7094 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7095 for (uint32_t m = 1; m <= 8; m++) { 7096 GemmMicrokernelTester() 7097 .mr(8) 7098 .nr(8) 7099 .kr(1) 7100 .sr(1) 7101 .m(m) 7102 .n(8) 7103 .k(4) 7104 .iterations(1) 7105 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7106 } 7107 } 7108 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_eq_4_subtile_n)7109 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 7110 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7111 for (uint32_t n = 1; n <= 8; n++) { 7112 GemmMicrokernelTester() 7113 .mr(8) 7114 .nr(8) 7115 .kr(1) 7116 .sr(1) 7117 .m(8) 7118 .n(n) 7119 .k(4) 7120 .iterations(1) 7121 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7122 } 7123 } 7124 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4)7125 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4) { 7126 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7127 for (size_t k = 1; k < 4; k++) { 7128 GemmMicrokernelTester() 7129 .mr(8) 7130 .nr(8) 7131 .kr(1) 7132 .sr(1) 7133 .m(8) 7134 
.n(8) 7135 .k(k) 7136 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7137 } 7138 } 7139 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_lt_4_subtile)7140 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_subtile) { 7141 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7142 for (size_t k = 1; k < 4; k++) { 7143 for (uint32_t n = 1; n <= 8; n++) { 7144 for (uint32_t m = 1; m <= 8; m++) { 7145 GemmMicrokernelTester() 7146 .mr(8) 7147 .nr(8) 7148 .kr(1) 7149 .sr(1) 7150 .m(m) 7151 .n(n) 7152 .k(k) 7153 .iterations(1) 7154 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7155 } 7156 } 7157 } 7158 } 7159 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4)7160 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4) { 7161 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7162 for (size_t k = 5; k < 8; k++) { 7163 GemmMicrokernelTester() 7164 .mr(8) 7165 .nr(8) 7166 .kr(1) 7167 .sr(1) 7168 .m(8) 7169 .n(8) 7170 .k(k) 7171 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7172 } 7173 } 7174 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_gt_4_subtile)7175 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_subtile) { 7176 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7177 for (size_t k = 5; k < 8; k++) { 7178 for (uint32_t n = 1; n <= 8; n++) { 7179 for (uint32_t m = 1; m <= 8; m++) { 7180 GemmMicrokernelTester() 7181 .mr(8) 7182 .nr(8) 7183 .kr(1) 7184 .sr(1) 7185 .m(m) 7186 .n(n) 7187 .k(k) 7188 .iterations(1) 7189 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7190 } 7191 } 7192 } 7193 } 7194 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4)7195 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4) { 7196 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7197 for (size_t k = 8; k <= 40; k += 4) { 7198 GemmMicrokernelTester() 7199 .mr(8) 7200 .nr(8) 7201 .kr(1) 7202 .sr(1) 7203 .m(8) 7204 .n(8) 7205 .k(k) 7206 
.Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7207 } 7208 } 7209 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,k_div_4_subtile)7210 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_subtile) { 7211 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7212 for (size_t k = 8; k <= 40; k += 4) { 7213 for (uint32_t n = 1; n <= 8; n++) { 7214 for (uint32_t m = 1; m <= 8; m++) { 7215 GemmMicrokernelTester() 7216 .mr(8) 7217 .nr(8) 7218 .kr(1) 7219 .sr(1) 7220 .m(m) 7221 .n(n) 7222 .k(k) 7223 .iterations(1) 7224 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7225 } 7226 } 7227 } 7228 } 7229 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8)7230 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8) { 7231 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7232 for (uint32_t n = 9; n < 16; n++) { 7233 for (size_t k = 1; k <= 20; k += 5) { 7234 GemmMicrokernelTester() 7235 .mr(8) 7236 .nr(8) 7237 .kr(1) 7238 .sr(1) 7239 .m(8) 7240 .n(n) 7241 .k(k) 7242 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7243 } 7244 } 7245 } 7246 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_strided_cn)7247 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) { 7248 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7249 for (uint32_t n = 9; n < 16; n++) { 7250 for (size_t k = 1; k <= 20; k += 5) { 7251 GemmMicrokernelTester() 7252 .mr(8) 7253 .nr(8) 7254 .kr(1) 7255 .sr(1) 7256 .m(8) 7257 .n(n) 7258 .k(k) 7259 .cn_stride(11) 7260 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7261 } 7262 } 7263 } 7264 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_subtile)7265 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_subtile) { 7266 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7267 for (uint32_t n = 9; n < 16; n++) { 7268 for (size_t k = 1; k <= 20; k += 5) { 7269 for (uint32_t m = 1; m <= 8; m++) { 7270 GemmMicrokernelTester() 7271 .mr(8) 
7272 .nr(8) 7273 .kr(1) 7274 .sr(1) 7275 .m(m) 7276 .n(n) 7277 .k(k) 7278 .iterations(1) 7279 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7280 } 7281 } 7282 } 7283 } 7284 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8)7285 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8) { 7286 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7287 for (uint32_t n = 16; n <= 24; n += 8) { 7288 for (size_t k = 1; k <= 20; k += 5) { 7289 GemmMicrokernelTester() 7290 .mr(8) 7291 .nr(8) 7292 .kr(1) 7293 .sr(1) 7294 .m(8) 7295 .n(n) 7296 .k(k) 7297 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7298 } 7299 } 7300 } 7301 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_strided_cn)7302 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) { 7303 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7304 for (uint32_t n = 16; n <= 24; n += 8) { 7305 for (size_t k = 1; k <= 20; k += 5) { 7306 GemmMicrokernelTester() 7307 .mr(8) 7308 .nr(8) 7309 .kr(1) 7310 .sr(1) 7311 .m(8) 7312 .n(n) 7313 .k(k) 7314 .cn_stride(11) 7315 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7316 } 7317 } 7318 } 7319 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_subtile)7320 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_subtile) { 7321 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7322 for (uint32_t n = 16; n <= 24; n += 8) { 7323 for (size_t k = 1; k <= 20; k += 5) { 7324 for (uint32_t m = 1; m <= 8; m++) { 7325 GemmMicrokernelTester() 7326 .mr(8) 7327 .nr(8) 7328 .kr(1) 7329 .sr(1) 7330 .m(m) 7331 .n(n) 7332 .k(k) 7333 .iterations(1) 7334 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7335 } 7336 } 7337 } 7338 } 7339 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,small_kernel)7340 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, small_kernel) { 7341 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7342 for (size_t k = 1; k <= 20; k += 5) { 
7343 GemmMicrokernelTester() 7344 .mr(8) 7345 .nr(8) 7346 .kr(1) 7347 .sr(1) 7348 .m(8) 7349 .n(8) 7350 .k(k) 7351 .ks(3) 7352 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7353 } 7354 } 7355 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,small_kernel_subtile)7356 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, small_kernel_subtile) { 7357 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7358 for (size_t k = 1; k <= 20; k += 5) { 7359 for (uint32_t n = 1; n <= 8; n++) { 7360 for (uint32_t m = 1; m <= 8; m++) { 7361 GemmMicrokernelTester() 7362 .mr(8) 7363 .nr(8) 7364 .kr(1) 7365 .sr(1) 7366 .m(m) 7367 .n(n) 7368 .k(k) 7369 .ks(3) 7370 .iterations(1) 7371 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7372 } 7373 } 7374 } 7375 } 7376 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_gt_8_small_kernel)7377 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) { 7378 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7379 for (uint32_t n = 9; n < 16; n++) { 7380 for (size_t k = 1; k <= 20; k += 5) { 7381 GemmMicrokernelTester() 7382 .mr(8) 7383 .nr(8) 7384 .kr(1) 7385 .sr(1) 7386 .m(8) 7387 .n(n) 7388 .k(k) 7389 .ks(3) 7390 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7391 } 7392 } 7393 } 7394 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,n_div_8_small_kernel)7395 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) { 7396 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7397 for (uint32_t n = 16; n <= 24; n += 8) { 7398 for (size_t k = 1; k <= 20; k += 5) { 7399 GemmMicrokernelTester() 7400 .mr(8) 7401 .nr(8) 7402 .kr(1) 7403 .sr(1) 7404 .m(8) 7405 .n(n) 7406 .k(k) 7407 .ks(3) 7408 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7409 } 7410 } 7411 } 7412 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cm_subtile)7413 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm_subtile) { 7414 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7415 for (size_t k = 1; k <= 20; k += 5) { 7416 for (uint32_t n = 1; n <= 8; n++) { 7417 for (uint32_t m = 1; m <= 8; m++) { 7418 GemmMicrokernelTester() 7419 .mr(8) 7420 .nr(8) 7421 .kr(1) 7422 .sr(1) 7423 .m(m) 7424 .n(n) 7425 .k(k) 7426 .cm_stride(11) 7427 .iterations(1) 7428 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7429 } 7430 } 7431 } 7432 } 7433 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,a_offset)7434 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, a_offset) { 7435 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7436 for (size_t k = 1; k <= 20; k += 5) { 7437 GemmMicrokernelTester() 7438 .mr(8) 7439 .nr(8) 7440 .kr(1) 7441 .sr(1) 7442 .m(8) 7443 .n(8) 7444 .k(k) 7445 .ks(3) 7446 .a_offset(163) 7447 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7448 } 7449 } 7450 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,zero)7451 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, zero) { 7452 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7453 for (size_t k = 1; k <= 20; k += 5) { 7454 for (uint32_t mz = 0; mz < 8; mz++) { 7455 GemmMicrokernelTester() 7456 .mr(8) 7457 .nr(8) 7458 .kr(1) 7459 .sr(1) 7460 .m(8) 7461 .n(8) 7462 .k(k) 7463 .ks(3) 7464 .a_offset(163) 7465 .zero_index(mz) 7466 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7467 } 7468 } 7469 } 7470 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,qmin)7471 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmin) { 7472 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7473 GemmMicrokernelTester() 7474 .mr(8) 7475 .nr(8) 7476 .kr(1) 7477 .sr(1) 7478 .m(8) 7479 .n(8) 7480 .k(4) 7481 .qmin(128) 7482 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7483 } 7484 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,qmax)7485 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmax) { 7486 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7487 GemmMicrokernelTester() 
7488 .mr(8) 7489 .nr(8) 7490 .kr(1) 7491 .sr(1) 7492 .m(8) 7493 .n(8) 7494 .k(4) 7495 .qmax(128) 7496 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7497 } 7498 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64,strided_cm)7499 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm) { 7500 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7501 GemmMicrokernelTester() 7502 .mr(8) 7503 .nr(8) 7504 .kr(1) 7505 .sr(1) 7506 .m(8) 7507 .n(8) 7508 .k(4) 7509 .cm_stride(11) 7510 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7511 } 7512 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7513 7514 7515 #if XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4)7516 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4) { 7517 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7518 GemmMicrokernelTester() 7519 .mr(8) 7520 .nr(16) 7521 .kr(1) 7522 .sr(1) 7523 .m(8) 7524 .n(16) 7525 .k(4) 7526 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7527 } 7528 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cn)7529 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cn) { 7530 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7531 GemmMicrokernelTester() 7532 .mr(8) 7533 .nr(16) 7534 .kr(1) 7535 .sr(1) 7536 .m(8) 7537 .n(16) 7538 .k(4) 7539 .cn_stride(19) 7540 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7541 } 7542 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile)7543 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile) { 7544 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7545 for (uint32_t n = 1; n <= 16; n++) { 7546 for (uint32_t m = 1; m <= 8; m++) { 7547 GemmMicrokernelTester() 7548 .mr(8) 7549 .nr(16) 7550 .kr(1) 7551 .sr(1) 7552 .m(m) 7553 .n(n) 7554 .k(4) 7555 .iterations(1) 7556 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 7557 } 7558 } 7559 } 7560 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile_m)7561 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) { 7562 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7563 for (uint32_t m = 1; m <= 8; m++) { 7564 GemmMicrokernelTester() 7565 .mr(8) 7566 .nr(16) 7567 .kr(1) 7568 .sr(1) 7569 .m(m) 7570 .n(16) 7571 .k(4) 7572 .iterations(1) 7573 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7574 } 7575 } 7576 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_eq_4_subtile_n)7577 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) { 7578 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7579 for (uint32_t n = 1; n <= 16; n++) { 7580 GemmMicrokernelTester() 7581 .mr(8) 7582 .nr(16) 7583 .kr(1) 7584 .sr(1) 7585 .m(8) 7586 .n(n) 7587 .k(4) 7588 .iterations(1) 7589 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7590 } 7591 } 7592 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_lt_4)7593 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4) { 7594 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7595 for (size_t k = 1; k < 4; k++) { 7596 GemmMicrokernelTester() 7597 .mr(8) 7598 .nr(16) 7599 .kr(1) 7600 .sr(1) 7601 .m(8) 7602 .n(16) 7603 .k(k) 7604 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7605 } 7606 } 7607 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,k_lt_4_subtile)7608 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_subtile) { 7609 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7610 for (size_t k = 1; k < 4; k++) { 7611 for (uint32_t n = 1; n <= 16; n++) { 7612 for (uint32_t m = 1; m <= 8; m++) { 7613 GemmMicrokernelTester() 7614 .mr(8) 7615 .nr(16) 7616 .kr(1) 7617 .sr(1) 7618 .m(m) 7619 .n(n) 7620 .k(k) 7621 .iterations(1) 7622 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7623 } 7624 } 7625 } 7626 } 7627 
// Full 8x16 tile, k = 5..7.
TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 5..7 over every (m, n) in 1..8 x 1..16; one iteration per shape.
TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 5; k < 8; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 8; ++m) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

// Full 8x16 tile, k = 8, 12, ..., 40.
TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    GemmMicrokernelTester()
      .mr(8).nr(16).kr(1).sr(1)
      .m(8).n(16).k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
  }
}

// k = 8, 12, ..., 40 over every (m, n) in 1..8 x 1..16; one iteration per shape.
TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
  TEST_REQUIRES_ARM_NEON_FP16_ARITH;
  for (size_t k = 8; k <= 40; k += 4) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 8; ++m) {
        GemmMicrokernelTester()
          .mr(8).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params);
      }
    }
  }
}

TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16)7698 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16) { 7699 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7700 for (uint32_t n = 17; n < 32; n++) { 7701 for (size_t k = 1; k <= 20; k += 5) { 7702 GemmMicrokernelTester() 7703 .mr(8) 7704 .nr(16) 7705 .kr(1) 7706 .sr(1) 7707 .m(8) 7708 .n(n) 7709 .k(k) 7710 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7711 } 7712 } 7713 } 7714 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16_strided_cn)7715 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) { 7716 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7717 for (uint32_t n = 17; n < 32; n++) { 7718 for (size_t k = 1; k <= 20; k += 5) { 7719 GemmMicrokernelTester() 7720 .mr(8) 7721 .nr(16) 7722 .kr(1) 7723 .sr(1) 7724 .m(8) 7725 .n(n) 7726 .k(k) 7727 .cn_stride(19) 7728 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7729 } 7730 } 7731 } 7732 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16_subtile)7733 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_subtile) { 7734 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7735 for (uint32_t n = 17; n < 32; n++) { 7736 for (size_t k = 1; k <= 20; k += 5) { 7737 for (uint32_t m = 1; m <= 8; m++) { 7738 GemmMicrokernelTester() 7739 .mr(8) 7740 .nr(16) 7741 .kr(1) 7742 .sr(1) 7743 .m(m) 7744 .n(n) 7745 .k(k) 7746 .iterations(1) 7747 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7748 } 7749 } 7750 } 7751 } 7752 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_div_16)7753 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16) { 7754 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7755 for (uint32_t n = 32; n <= 48; n += 16) { 7756 for (size_t k = 1; k <= 20; k += 5) { 7757 GemmMicrokernelTester() 7758 .mr(8) 7759 .nr(16) 7760 .kr(1) 7761 .sr(1) 7762 .m(8) 7763 .n(n) 7764 .k(k) 7765 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, 
xnn_init_f16_minmax_neon_params); 7766 } 7767 } 7768 } 7769 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_div_16_strided_cn)7770 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) { 7771 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7772 for (uint32_t n = 32; n <= 48; n += 16) { 7773 for (size_t k = 1; k <= 20; k += 5) { 7774 GemmMicrokernelTester() 7775 .mr(8) 7776 .nr(16) 7777 .kr(1) 7778 .sr(1) 7779 .m(8) 7780 .n(n) 7781 .k(k) 7782 .cn_stride(19) 7783 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7784 } 7785 } 7786 } 7787 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_div_16_subtile)7788 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_subtile) { 7789 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7790 for (uint32_t n = 32; n <= 48; n += 16) { 7791 for (size_t k = 1; k <= 20; k += 5) { 7792 for (uint32_t m = 1; m <= 8; m++) { 7793 GemmMicrokernelTester() 7794 .mr(8) 7795 .nr(16) 7796 .kr(1) 7797 .sr(1) 7798 .m(m) 7799 .n(n) 7800 .k(k) 7801 .iterations(1) 7802 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7803 } 7804 } 7805 } 7806 } 7807 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,small_kernel)7808 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, small_kernel) { 7809 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7810 for (size_t k = 1; k <= 20; k += 5) { 7811 GemmMicrokernelTester() 7812 .mr(8) 7813 .nr(16) 7814 .kr(1) 7815 .sr(1) 7816 .m(8) 7817 .n(16) 7818 .k(k) 7819 .ks(3) 7820 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7821 } 7822 } 7823 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,small_kernel_subtile)7824 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, small_kernel_subtile) { 7825 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7826 for (size_t k = 1; k <= 20; k += 5) { 7827 for (uint32_t n = 1; n <= 16; n++) { 7828 for (uint32_t m = 1; m <= 8; m++) { 7829 GemmMicrokernelTester() 7830 .mr(8) 7831 .nr(16) 7832 .kr(1) 7833 
.sr(1) 7834 .m(m) 7835 .n(n) 7836 .k(k) 7837 .ks(3) 7838 .iterations(1) 7839 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7840 } 7841 } 7842 } 7843 } 7844 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_gt_16_small_kernel)7845 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) { 7846 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7847 for (uint32_t n = 17; n < 32; n++) { 7848 for (size_t k = 1; k <= 20; k += 5) { 7849 GemmMicrokernelTester() 7850 .mr(8) 7851 .nr(16) 7852 .kr(1) 7853 .sr(1) 7854 .m(8) 7855 .n(n) 7856 .k(k) 7857 .ks(3) 7858 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7859 } 7860 } 7861 } 7862 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,n_div_16_small_kernel)7863 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) { 7864 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7865 for (uint32_t n = 32; n <= 48; n += 16) { 7866 for (size_t k = 1; k <= 20; k += 5) { 7867 GemmMicrokernelTester() 7868 .mr(8) 7869 .nr(16) 7870 .kr(1) 7871 .sr(1) 7872 .m(8) 7873 .n(n) 7874 .k(k) 7875 .ks(3) 7876 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7877 } 7878 } 7879 } 7880 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cm_subtile)7881 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm_subtile) { 7882 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7883 for (size_t k = 1; k <= 20; k += 5) { 7884 for (uint32_t n = 1; n <= 16; n++) { 7885 for (uint32_t m = 1; m <= 8; m++) { 7886 GemmMicrokernelTester() 7887 .mr(8) 7888 .nr(16) 7889 .kr(1) 7890 .sr(1) 7891 .m(m) 7892 .n(n) 7893 .k(k) 7894 .cm_stride(19) 7895 .iterations(1) 7896 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7897 } 7898 } 7899 } 7900 } 7901 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,a_offset)7902 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, a_offset) { 7903 
TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7904 for (size_t k = 1; k <= 20; k += 5) { 7905 GemmMicrokernelTester() 7906 .mr(8) 7907 .nr(16) 7908 .kr(1) 7909 .sr(1) 7910 .m(8) 7911 .n(16) 7912 .k(k) 7913 .ks(3) 7914 .a_offset(163) 7915 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7916 } 7917 } 7918 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,zero)7919 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, zero) { 7920 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7921 for (size_t k = 1; k <= 20; k += 5) { 7922 for (uint32_t mz = 0; mz < 8; mz++) { 7923 GemmMicrokernelTester() 7924 .mr(8) 7925 .nr(16) 7926 .kr(1) 7927 .sr(1) 7928 .m(8) 7929 .n(16) 7930 .k(k) 7931 .ks(3) 7932 .a_offset(163) 7933 .zero_index(mz) 7934 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7935 } 7936 } 7937 } 7938 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,qmin)7939 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmin) { 7940 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7941 GemmMicrokernelTester() 7942 .mr(8) 7943 .nr(16) 7944 .kr(1) 7945 .sr(1) 7946 .m(8) 7947 .n(16) 7948 .k(4) 7949 .qmin(128) 7950 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7951 } 7952 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,qmax)7953 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmax) { 7954 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7955 GemmMicrokernelTester() 7956 .mr(8) 7957 .nr(16) 7958 .kr(1) 7959 .sr(1) 7960 .m(8) 7961 .n(16) 7962 .k(4) 7963 .qmax(128) 7964 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7965 } 7966 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64,strided_cm)7967 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm) { 7968 TEST_REQUIRES_ARM_NEON_FP16_ARITH; 7969 GemmMicrokernelTester() 7970 .mr(8) 7971 .nr(16) 7972 .kr(1) 7973 .sr(1) 7974 .m(8) 7975 .n(16) 7976 .k(4) 7977 .cm_stride(19) 7978 
.Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64, xnn_init_f16_minmax_neon_params); 7979 } 7980 #endif // XNN_ENABLE_ARM_FP16 && (XNN_ARCH_ARM || XNN_ARCH_ARM64) 7981 7982 7983 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1)7984 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1) { 7985 TEST_REQUIRES_X86_AVX2; 7986 GemmMicrokernelTester() 7987 .mr(1) 7988 .nr(8) 7989 .kr(1) 7990 .sr(1) 7991 .m(1) 7992 .n(8) 7993 .k(1) 7994 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 7995 } 7996 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,strided_cn)7997 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, strided_cn) { 7998 TEST_REQUIRES_X86_AVX2; 7999 GemmMicrokernelTester() 8000 .mr(1) 8001 .nr(8) 8002 .kr(1) 8003 .sr(1) 8004 .m(1) 8005 .n(8) 8006 .k(1) 8007 .cn_stride(11) 8008 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8009 } 8010 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_subtile)8011 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1_subtile) { 8012 TEST_REQUIRES_X86_AVX2; 8013 for (uint32_t n = 1; n <= 8; n++) { 8014 for (uint32_t m = 1; m <= 1; m++) { 8015 GemmMicrokernelTester() 8016 .mr(1) 8017 .nr(8) 8018 .kr(1) 8019 .sr(1) 8020 .m(m) 8021 .n(n) 8022 .k(1) 8023 .iterations(1) 8024 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8025 } 8026 } 8027 } 8028 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_subtile_m)8029 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 8030 TEST_REQUIRES_X86_AVX2; 8031 for (uint32_t m = 1; m <= 1; m++) { 8032 GemmMicrokernelTester() 8033 .mr(1) 8034 .nr(8) 8035 .kr(1) 8036 .sr(1) 8037 .m(m) 8038 .n(8) 8039 .k(1) 8040 .iterations(1) 8041 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8042 } 8043 } 8044 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,k_eq_1_subtile_n)8045 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, 
k_eq_1_subtile_n) { 8046 TEST_REQUIRES_X86_AVX2; 8047 for (uint32_t n = 1; n <= 8; n++) { 8048 GemmMicrokernelTester() 8049 .mr(1) 8050 .nr(8) 8051 .kr(1) 8052 .sr(1) 8053 .m(1) 8054 .n(n) 8055 .k(1) 8056 .iterations(1) 8057 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8058 } 8059 } 8060 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,k_gt_1)8061 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, k_gt_1) { 8062 TEST_REQUIRES_X86_AVX2; 8063 for (size_t k = 2; k < 10; k++) { 8064 GemmMicrokernelTester() 8065 .mr(1) 8066 .nr(8) 8067 .kr(1) 8068 .sr(1) 8069 .m(1) 8070 .n(8) 8071 .k(k) 8072 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8073 } 8074 } 8075 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,k_gt_1_subtile)8076 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, k_gt_1_subtile) { 8077 TEST_REQUIRES_X86_AVX2; 8078 for (size_t k = 2; k < 10; k++) { 8079 for (uint32_t n = 1; n <= 8; n++) { 8080 for (uint32_t m = 1; m <= 1; m++) { 8081 GemmMicrokernelTester() 8082 .mr(1) 8083 .nr(8) 8084 .kr(1) 8085 .sr(1) 8086 .m(m) 8087 .n(n) 8088 .k(k) 8089 .iterations(1) 8090 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8091 } 8092 } 8093 } 8094 } 8095 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8)8096 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8) { 8097 TEST_REQUIRES_X86_AVX2; 8098 for (uint32_t n = 9; n < 16; n++) { 8099 for (size_t k = 1; k <= 5; k += 2) { 8100 GemmMicrokernelTester() 8101 .mr(1) 8102 .nr(8) 8103 .kr(1) 8104 .sr(1) 8105 .m(1) 8106 .n(n) 8107 .k(k) 8108 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8109 } 8110 } 8111 } 8112 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8_strided_cn)8113 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8_strided_cn) { 8114 TEST_REQUIRES_X86_AVX2; 8115 for (uint32_t n = 9; n < 16; n++) { 8116 for (size_t k = 1; k <= 5; k += 2) { 8117 GemmMicrokernelTester() 8118 .mr(1) 8119 
.nr(8) 8120 .kr(1) 8121 .sr(1) 8122 .m(1) 8123 .n(n) 8124 .k(k) 8125 .cn_stride(11) 8126 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8127 } 8128 } 8129 } 8130 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8_subtile)8131 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8_subtile) { 8132 TEST_REQUIRES_X86_AVX2; 8133 for (uint32_t n = 9; n < 16; n++) { 8134 for (size_t k = 1; k <= 5; k += 2) { 8135 for (uint32_t m = 1; m <= 1; m++) { 8136 GemmMicrokernelTester() 8137 .mr(1) 8138 .nr(8) 8139 .kr(1) 8140 .sr(1) 8141 .m(m) 8142 .n(n) 8143 .k(k) 8144 .iterations(1) 8145 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8146 } 8147 } 8148 } 8149 } 8150 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8)8151 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8) { 8152 TEST_REQUIRES_X86_AVX2; 8153 for (uint32_t n = 16; n <= 24; n += 8) { 8154 for (size_t k = 1; k <= 5; k += 2) { 8155 GemmMicrokernelTester() 8156 .mr(1) 8157 .nr(8) 8158 .kr(1) 8159 .sr(1) 8160 .m(1) 8161 .n(n) 8162 .k(k) 8163 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8164 } 8165 } 8166 } 8167 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8_strided_cn)8168 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8_strided_cn) { 8169 TEST_REQUIRES_X86_AVX2; 8170 for (uint32_t n = 16; n <= 24; n += 8) { 8171 for (size_t k = 1; k <= 5; k += 2) { 8172 GemmMicrokernelTester() 8173 .mr(1) 8174 .nr(8) 8175 .kr(1) 8176 .sr(1) 8177 .m(1) 8178 .n(n) 8179 .k(k) 8180 .cn_stride(11) 8181 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8182 } 8183 } 8184 } 8185 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8_subtile)8186 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8_subtile) { 8187 TEST_REQUIRES_X86_AVX2; 8188 for (uint32_t n = 16; n <= 24; n += 8) { 8189 for (size_t k = 1; k <= 5; k += 2) { 8190 for (uint32_t m = 1; m <= 1; m++) { 8191 
GemmMicrokernelTester() 8192 .mr(1) 8193 .nr(8) 8194 .kr(1) 8195 .sr(1) 8196 .m(m) 8197 .n(n) 8198 .k(k) 8199 .iterations(1) 8200 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8201 } 8202 } 8203 } 8204 } 8205 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,small_kernel)8206 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, small_kernel) { 8207 TEST_REQUIRES_X86_AVX2; 8208 for (size_t k = 1; k <= 5; k += 2) { 8209 GemmMicrokernelTester() 8210 .mr(1) 8211 .nr(8) 8212 .kr(1) 8213 .sr(1) 8214 .m(1) 8215 .n(8) 8216 .k(k) 8217 .ks(3) 8218 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8219 } 8220 } 8221 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,small_kernel_subtile)8222 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, small_kernel_subtile) { 8223 TEST_REQUIRES_X86_AVX2; 8224 for (size_t k = 1; k <= 5; k += 2) { 8225 for (uint32_t n = 1; n <= 8; n++) { 8226 for (uint32_t m = 1; m <= 1; m++) { 8227 GemmMicrokernelTester() 8228 .mr(1) 8229 .nr(8) 8230 .kr(1) 8231 .sr(1) 8232 .m(m) 8233 .n(n) 8234 .k(k) 8235 .ks(3) 8236 .iterations(1) 8237 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8238 } 8239 } 8240 } 8241 } 8242 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_gt_8_small_kernel)8243 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_gt_8_small_kernel) { 8244 TEST_REQUIRES_X86_AVX2; 8245 for (uint32_t n = 9; n < 16; n++) { 8246 for (size_t k = 1; k <= 5; k += 2) { 8247 GemmMicrokernelTester() 8248 .mr(1) 8249 .nr(8) 8250 .kr(1) 8251 .sr(1) 8252 .m(1) 8253 .n(n) 8254 .k(k) 8255 .ks(3) 8256 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8257 } 8258 } 8259 } 8260 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,n_div_8_small_kernel)8261 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, n_div_8_small_kernel) { 8262 TEST_REQUIRES_X86_AVX2; 8263 for (uint32_t n = 16; n <= 24; n += 8) { 8264 for (size_t k = 1; k <= 5; k += 2) { 8265 
GemmMicrokernelTester() 8266 .mr(1) 8267 .nr(8) 8268 .kr(1) 8269 .sr(1) 8270 .m(1) 8271 .n(n) 8272 .k(k) 8273 .ks(3) 8274 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8275 } 8276 } 8277 } 8278 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,strided_cm_subtile)8279 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, strided_cm_subtile) { 8280 TEST_REQUIRES_X86_AVX2; 8281 for (size_t k = 1; k <= 5; k += 2) { 8282 for (uint32_t n = 1; n <= 8; n++) { 8283 for (uint32_t m = 1; m <= 1; m++) { 8284 GemmMicrokernelTester() 8285 .mr(1) 8286 .nr(8) 8287 .kr(1) 8288 .sr(1) 8289 .m(m) 8290 .n(n) 8291 .k(k) 8292 .cm_stride(11) 8293 .iterations(1) 8294 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8295 } 8296 } 8297 } 8298 } 8299 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,a_offset)8300 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, a_offset) { 8301 TEST_REQUIRES_X86_AVX2; 8302 for (size_t k = 1; k <= 5; k += 2) { 8303 GemmMicrokernelTester() 8304 .mr(1) 8305 .nr(8) 8306 .kr(1) 8307 .sr(1) 8308 .m(1) 8309 .n(8) 8310 .k(k) 8311 .ks(3) 8312 .a_offset(7) 8313 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8314 } 8315 } 8316 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,zero)8317 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, zero) { 8318 TEST_REQUIRES_X86_AVX2; 8319 for (size_t k = 1; k <= 5; k += 2) { 8320 for (uint32_t mz = 0; mz < 1; mz++) { 8321 GemmMicrokernelTester() 8322 .mr(1) 8323 .nr(8) 8324 .kr(1) 8325 .sr(1) 8326 .m(1) 8327 .n(8) 8328 .k(k) 8329 .ks(3) 8330 .a_offset(7) 8331 .zero_index(mz) 8332 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8333 } 8334 } 8335 } 8336 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,qmin)8337 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, qmin) { 8338 TEST_REQUIRES_X86_AVX2; 8339 GemmMicrokernelTester() 8340 .mr(1) 8341 .nr(8) 8342 .kr(1) 8343 .sr(1) 8344 .m(1) 8345 .n(8) 8346 .k(1) 8347 .qmin(128) 8348 
.Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8349 } 8350 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,qmax)8351 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, qmax) { 8352 TEST_REQUIRES_X86_AVX2; 8353 GemmMicrokernelTester() 8354 .mr(1) 8355 .nr(8) 8356 .kr(1) 8357 .sr(1) 8358 .m(1) 8359 .n(8) 8360 .k(1) 8361 .qmax(128) 8362 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8363 } 8364 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST,strided_cm)8365 TEST(F16_IGEMM_MINMAX_1X8__AVX2_BROADCAST, strided_cm) { 8366 TEST_REQUIRES_X86_AVX2; 8367 GemmMicrokernelTester() 8368 .mr(1) 8369 .nr(8) 8370 .kr(1) 8371 .sr(1) 8372 .m(1) 8373 .n(8) 8374 .k(1) 8375 .cm_stride(11) 8376 .Test(xnn_f16_igemm_minmax_ukernel_1x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8377 } 8378 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 8379 8380 8381 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1)8382 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1) { 8383 TEST_REQUIRES_X86_AVX2; 8384 GemmMicrokernelTester() 8385 .mr(1) 8386 .nr(16) 8387 .kr(1) 8388 .sr(1) 8389 .m(1) 8390 .n(16) 8391 .k(1) 8392 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8393 } 8394 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,strided_cn)8395 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, strided_cn) { 8396 TEST_REQUIRES_X86_AVX2; 8397 GemmMicrokernelTester() 8398 .mr(1) 8399 .nr(16) 8400 .kr(1) 8401 .sr(1) 8402 .m(1) 8403 .n(16) 8404 .k(1) 8405 .cn_stride(19) 8406 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8407 } 8408 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_subtile)8409 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_subtile) { 8410 TEST_REQUIRES_X86_AVX2; 8411 for (uint32_t n = 1; n <= 16; n++) { 8412 for (uint32_t m = 1; m <= 1; m++) { 8413 GemmMicrokernelTester() 8414 .mr(1) 8415 .nr(16) 8416 .kr(1) 8417 .sr(1) 8418 
.m(m) 8419 .n(n) 8420 .k(1) 8421 .iterations(1) 8422 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8423 } 8424 } 8425 } 8426 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_subtile_m)8427 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 8428 TEST_REQUIRES_X86_AVX2; 8429 for (uint32_t m = 1; m <= 1; m++) { 8430 GemmMicrokernelTester() 8431 .mr(1) 8432 .nr(16) 8433 .kr(1) 8434 .sr(1) 8435 .m(m) 8436 .n(16) 8437 .k(1) 8438 .iterations(1) 8439 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8440 } 8441 } 8442 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,k_eq_1_subtile_n)8443 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 8444 TEST_REQUIRES_X86_AVX2; 8445 for (uint32_t n = 1; n <= 16; n++) { 8446 GemmMicrokernelTester() 8447 .mr(1) 8448 .nr(16) 8449 .kr(1) 8450 .sr(1) 8451 .m(1) 8452 .n(n) 8453 .k(1) 8454 .iterations(1) 8455 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8456 } 8457 } 8458 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,k_gt_1)8459 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, k_gt_1) { 8460 TEST_REQUIRES_X86_AVX2; 8461 for (size_t k = 2; k < 10; k++) { 8462 GemmMicrokernelTester() 8463 .mr(1) 8464 .nr(16) 8465 .kr(1) 8466 .sr(1) 8467 .m(1) 8468 .n(16) 8469 .k(k) 8470 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8471 } 8472 } 8473 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,k_gt_1_subtile)8474 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, k_gt_1_subtile) { 8475 TEST_REQUIRES_X86_AVX2; 8476 for (size_t k = 2; k < 10; k++) { 8477 for (uint32_t n = 1; n <= 16; n++) { 8478 for (uint32_t m = 1; m <= 1; m++) { 8479 GemmMicrokernelTester() 8480 .mr(1) 8481 .nr(16) 8482 .kr(1) 8483 .sr(1) 8484 .m(m) 8485 .n(n) 8486 .k(k) 8487 .iterations(1) 8488 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8489 } 8490 } 8491 } 8492 } 8493 
// Tests of xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast (mr=1, nr=16)
// for n above 16 and for n equal to 32 and 48.

// n in [17, 31], k in {1, 3, 5}, m fixed at 1.
TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_gt_16, additionally setting cn_stride(19).
TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n in [17, 31], k in {1, 3, 5}, m swept over [1, 1]; one iteration per case.
TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 1; m++) {
        GemmMicrokernelTester()
          .mr(1)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n in {32, 48}, k in {1, 3, 5}, m fixed at 1.
TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(1)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(1)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16_strided_cn)8566 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16_strided_cn) { 8567 TEST_REQUIRES_X86_AVX2; 8568 for (uint32_t n = 32; n <= 48; n += 16) { 8569 for (size_t k = 1; k <= 5; k += 2) { 8570 GemmMicrokernelTester() 8571 .mr(1) 8572 .nr(16) 8573 .kr(1) 8574 .sr(1) 8575 .m(1) 8576 .n(n) 8577 .k(k) 8578 .cn_stride(19) 8579 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8580 } 8581 } 8582 } 8583 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16_subtile)8584 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16_subtile) { 8585 TEST_REQUIRES_X86_AVX2; 8586 for (uint32_t n = 32; n <= 48; n += 16) { 8587 for (size_t k = 1; k <= 5; k += 2) { 8588 for (uint32_t m = 1; m <= 1; m++) { 8589 GemmMicrokernelTester() 8590 .mr(1) 8591 .nr(16) 8592 .kr(1) 8593 .sr(1) 8594 .m(m) 8595 .n(n) 8596 .k(k) 8597 .iterations(1) 8598 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8599 } 8600 } 8601 } 8602 } 8603 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,small_kernel)8604 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, small_kernel) { 8605 TEST_REQUIRES_X86_AVX2; 8606 for (size_t k = 1; k <= 5; k += 2) { 8607 GemmMicrokernelTester() 8608 .mr(1) 8609 .nr(16) 8610 .kr(1) 8611 .sr(1) 8612 .m(1) 8613 .n(16) 8614 .k(k) 8615 .ks(3) 8616 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8617 } 8618 } 8619 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,small_kernel_subtile)8620 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, small_kernel_subtile) { 8621 TEST_REQUIRES_X86_AVX2; 8622 for (size_t k = 1; k <= 5; k += 2) { 8623 for (uint32_t n = 1; n <= 16; n++) { 8624 for (uint32_t m = 1; m <= 1; m++) { 8625 GemmMicrokernelTester() 8626 .mr(1) 8627 .nr(16) 8628 .kr(1) 8629 .sr(1) 8630 .m(m) 8631 .n(n) 8632 .k(k) 8633 .ks(3) 8634 .iterations(1) 8635 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 8636 } 8637 } 8638 } 8639 } 8640 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,n_gt_16_small_kernel)8641 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_gt_16_small_kernel) { 8642 TEST_REQUIRES_X86_AVX2; 8643 for (uint32_t n = 17; n < 32; n++) { 8644 for (size_t k = 1; k <= 5; k += 2) { 8645 GemmMicrokernelTester() 8646 .mr(1) 8647 .nr(16) 8648 .kr(1) 8649 .sr(1) 8650 .m(1) 8651 .n(n) 8652 .k(k) 8653 .ks(3) 8654 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8655 } 8656 } 8657 } 8658 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,n_div_16_small_kernel)8659 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, n_div_16_small_kernel) { 8660 TEST_REQUIRES_X86_AVX2; 8661 for (uint32_t n = 32; n <= 48; n += 16) { 8662 for (size_t k = 1; k <= 5; k += 2) { 8663 GemmMicrokernelTester() 8664 .mr(1) 8665 .nr(16) 8666 .kr(1) 8667 .sr(1) 8668 .m(1) 8669 .n(n) 8670 .k(k) 8671 .ks(3) 8672 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8673 } 8674 } 8675 } 8676 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,strided_cm_subtile)8677 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, strided_cm_subtile) { 8678 TEST_REQUIRES_X86_AVX2; 8679 for (size_t k = 1; k <= 5; k += 2) { 8680 for (uint32_t n = 1; n <= 16; n++) { 8681 for (uint32_t m = 1; m <= 1; m++) { 8682 GemmMicrokernelTester() 8683 .mr(1) 8684 .nr(16) 8685 .kr(1) 8686 .sr(1) 8687 .m(m) 8688 .n(n) 8689 .k(k) 8690 .cm_stride(19) 8691 .iterations(1) 8692 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8693 } 8694 } 8695 } 8696 } 8697 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,a_offset)8698 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, a_offset) { 8699 TEST_REQUIRES_X86_AVX2; 8700 for (size_t k = 1; k <= 5; k += 2) { 8701 GemmMicrokernelTester() 8702 .mr(1) 8703 .nr(16) 8704 .kr(1) 8705 .sr(1) 8706 .m(1) 8707 .n(16) 8708 .k(k) 8709 .ks(3) 8710 .a_offset(7) 8711 
.Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8712 } 8713 } 8714 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,zero)8715 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, zero) { 8716 TEST_REQUIRES_X86_AVX2; 8717 for (size_t k = 1; k <= 5; k += 2) { 8718 for (uint32_t mz = 0; mz < 1; mz++) { 8719 GemmMicrokernelTester() 8720 .mr(1) 8721 .nr(16) 8722 .kr(1) 8723 .sr(1) 8724 .m(1) 8725 .n(16) 8726 .k(k) 8727 .ks(3) 8728 .a_offset(7) 8729 .zero_index(mz) 8730 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8731 } 8732 } 8733 } 8734 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,qmin)8735 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, qmin) { 8736 TEST_REQUIRES_X86_AVX2; 8737 GemmMicrokernelTester() 8738 .mr(1) 8739 .nr(16) 8740 .kr(1) 8741 .sr(1) 8742 .m(1) 8743 .n(16) 8744 .k(1) 8745 .qmin(128) 8746 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8747 } 8748 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,qmax)8749 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, qmax) { 8750 TEST_REQUIRES_X86_AVX2; 8751 GemmMicrokernelTester() 8752 .mr(1) 8753 .nr(16) 8754 .kr(1) 8755 .sr(1) 8756 .m(1) 8757 .n(16) 8758 .k(1) 8759 .qmax(128) 8760 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8761 } 8762 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST,strided_cm)8763 TEST(F16_IGEMM_MINMAX_1X16__AVX2_BROADCAST, strided_cm) { 8764 TEST_REQUIRES_X86_AVX2; 8765 GemmMicrokernelTester() 8766 .mr(1) 8767 .nr(16) 8768 .kr(1) 8769 .sr(1) 8770 .m(1) 8771 .n(16) 8772 .k(1) 8773 .cm_stride(19) 8774 .Test(xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8775 } 8776 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 8777 8778 8779 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1)8780 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1) { 8781 TEST_REQUIRES_X86_AVX2; 8782 GemmMicrokernelTester() 8783 
.mr(3) 8784 .nr(16) 8785 .kr(1) 8786 .sr(1) 8787 .m(3) 8788 .n(16) 8789 .k(1) 8790 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8791 } 8792 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,strided_cn)8793 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, strided_cn) { 8794 TEST_REQUIRES_X86_AVX2; 8795 GemmMicrokernelTester() 8796 .mr(3) 8797 .nr(16) 8798 .kr(1) 8799 .sr(1) 8800 .m(3) 8801 .n(16) 8802 .k(1) 8803 .cn_stride(19) 8804 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8805 } 8806 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_subtile)8807 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_subtile) { 8808 TEST_REQUIRES_X86_AVX2; 8809 for (uint32_t n = 1; n <= 16; n++) { 8810 for (uint32_t m = 1; m <= 3; m++) { 8811 GemmMicrokernelTester() 8812 .mr(3) 8813 .nr(16) 8814 .kr(1) 8815 .sr(1) 8816 .m(m) 8817 .n(n) 8818 .k(1) 8819 .iterations(1) 8820 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8821 } 8822 } 8823 } 8824 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_subtile_m)8825 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 8826 TEST_REQUIRES_X86_AVX2; 8827 for (uint32_t m = 1; m <= 3; m++) { 8828 GemmMicrokernelTester() 8829 .mr(3) 8830 .nr(16) 8831 .kr(1) 8832 .sr(1) 8833 .m(m) 8834 .n(16) 8835 .k(1) 8836 .iterations(1) 8837 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8838 } 8839 } 8840 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,k_eq_1_subtile_n)8841 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 8842 TEST_REQUIRES_X86_AVX2; 8843 for (uint32_t n = 1; n <= 16; n++) { 8844 GemmMicrokernelTester() 8845 .mr(3) 8846 .nr(16) 8847 .kr(1) 8848 .sr(1) 8849 .m(3) 8850 .n(n) 8851 .k(1) 8852 .iterations(1) 8853 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 8854 } 8855 } 8856 
// Tests of xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast (mr=3, nr=16)
// sweeping k above 1 and n above 16.

// Full 3x16 tile, k in [2, 9].
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    GemmMicrokernelTester()
      .mr(3)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(3)
      .n(16)
      .k(k)
      .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k in [2, 9] with every (m, n) in [1, 3] x [1, 16]; one iteration per case.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n in [17, 31], k in {1, 3, 5}, m fixed at 3.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(3)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_gt_16, additionally setting cn_stride(19).
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(3)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n in [17, 31], k in {1, 3, 5}, m swept over [1, 3]; one iteration per case.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n in {32, 48}, k in {1, 3, 5}, m fixed at 3.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(3)
        .n(n)
        .k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_div_16, additionally setting cn_stride(19).
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(3)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n in {32, 48}, k in {1, 3, 5}, m swept over [1, 3]; one iteration per case.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}
// Tests of xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast (mr=3, nr=16)
// that all set ks(3).

// Full 3x16 tile with ks(3), k in {1, 3, 5}.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester()
      .mr(3)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(3)
      .n(16)
      .k(k)
      .ks(3)
      .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// ks(3) with every (m, n) in [1, 3] x [1, 16], k in {1, 3, 5}.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, small_kernel_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 1; k <= 5; k += 2) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 3; m++) {
        GemmMicrokernelTester()
          .mr(3)
          .nr(16)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .ks(3)
          .iterations(1)
          .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// ks(3) with n in [17, 31], k in {1, 3, 5}, m fixed at 3.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_gt_16_small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(3)
        .n(n)
        .k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// ks(3) with n in {32, 48}, k in {1, 3, 5}, m fixed at 3.
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, n_div_16_small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
        .mr(3)
        .nr(16)
        .kr(1)
        .sr(1)
        .m(3)
        .n(n)
        .k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,strided_cm_subtile)9075 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, strided_cm_subtile) { 9076 TEST_REQUIRES_X86_AVX2; 9077 for (size_t k = 1; k <= 5; k += 2) { 9078 for (uint32_t n = 1; n <= 16; n++) { 9079 for (uint32_t m = 1; m <= 3; m++) { 9080 GemmMicrokernelTester() 9081 .mr(3) 9082 .nr(16) 9083 .kr(1) 9084 .sr(1) 9085 .m(m) 9086 .n(n) 9087 .k(k) 9088 .cm_stride(19) 9089 .iterations(1) 9090 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9091 } 9092 } 9093 } 9094 } 9095 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,a_offset)9096 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, a_offset) { 9097 TEST_REQUIRES_X86_AVX2; 9098 for (size_t k = 1; k <= 5; k += 2) { 9099 GemmMicrokernelTester() 9100 .mr(3) 9101 .nr(16) 9102 .kr(1) 9103 .sr(1) 9104 .m(3) 9105 .n(16) 9106 .k(k) 9107 .ks(3) 9108 .a_offset(17) 9109 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9110 } 9111 } 9112 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,zero)9113 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, zero) { 9114 TEST_REQUIRES_X86_AVX2; 9115 for (size_t k = 1; k <= 5; k += 2) { 9116 for (uint32_t mz = 0; mz < 3; mz++) { 9117 GemmMicrokernelTester() 9118 .mr(3) 9119 .nr(16) 9120 .kr(1) 9121 .sr(1) 9122 .m(3) 9123 .n(16) 9124 .k(k) 9125 .ks(3) 9126 .a_offset(17) 9127 .zero_index(mz) 9128 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9129 } 9130 } 9131 } 9132 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,qmin)9133 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, qmin) { 9134 TEST_REQUIRES_X86_AVX2; 9135 GemmMicrokernelTester() 9136 .mr(3) 9137 .nr(16) 9138 .kr(1) 9139 .sr(1) 9140 .m(3) 9141 .n(16) 9142 .k(1) 9143 .qmin(128) 9144 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9145 } 9146 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,qmax)9147 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, qmax) { 9148 
TEST_REQUIRES_X86_AVX2; 9149 GemmMicrokernelTester() 9150 .mr(3) 9151 .nr(16) 9152 .kr(1) 9153 .sr(1) 9154 .m(3) 9155 .n(16) 9156 .k(1) 9157 .qmax(128) 9158 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9159 } 9160 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST,strided_cm)9161 TEST(F16_IGEMM_MINMAX_3X16__AVX2_BROADCAST, strided_cm) { 9162 TEST_REQUIRES_X86_AVX2; 9163 GemmMicrokernelTester() 9164 .mr(3) 9165 .nr(16) 9166 .kr(1) 9167 .sr(1) 9168 .m(3) 9169 .n(16) 9170 .k(1) 9171 .cm_stride(19) 9172 .Test(xnn_f16_igemm_minmax_ukernel_3x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9173 } 9174 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 9175 9176 9177 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1)9178 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1) { 9179 TEST_REQUIRES_X86_AVX2; 9180 GemmMicrokernelTester() 9181 .mr(4) 9182 .nr(8) 9183 .kr(1) 9184 .sr(1) 9185 .m(4) 9186 .n(8) 9187 .k(1) 9188 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9189 } 9190 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,strided_cn)9191 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, strided_cn) { 9192 TEST_REQUIRES_X86_AVX2; 9193 GemmMicrokernelTester() 9194 .mr(4) 9195 .nr(8) 9196 .kr(1) 9197 .sr(1) 9198 .m(4) 9199 .n(8) 9200 .k(1) 9201 .cn_stride(11) 9202 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9203 } 9204 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_subtile)9205 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_subtile) { 9206 TEST_REQUIRES_X86_AVX2; 9207 for (uint32_t n = 1; n <= 8; n++) { 9208 for (uint32_t m = 1; m <= 4; m++) { 9209 GemmMicrokernelTester() 9210 .mr(4) 9211 .nr(8) 9212 .kr(1) 9213 .sr(1) 9214 .m(m) 9215 .n(n) 9216 .k(1) 9217 .iterations(1) 9218 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9219 } 9220 } 9221 } 9222 
TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_subtile_m)9223 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 9224 TEST_REQUIRES_X86_AVX2; 9225 for (uint32_t m = 1; m <= 4; m++) { 9226 GemmMicrokernelTester() 9227 .mr(4) 9228 .nr(8) 9229 .kr(1) 9230 .sr(1) 9231 .m(m) 9232 .n(8) 9233 .k(1) 9234 .iterations(1) 9235 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9236 } 9237 } 9238 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,k_eq_1_subtile_n)9239 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 9240 TEST_REQUIRES_X86_AVX2; 9241 for (uint32_t n = 1; n <= 8; n++) { 9242 GemmMicrokernelTester() 9243 .mr(4) 9244 .nr(8) 9245 .kr(1) 9246 .sr(1) 9247 .m(4) 9248 .n(n) 9249 .k(1) 9250 .iterations(1) 9251 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9252 } 9253 } 9254 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,k_gt_1)9255 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, k_gt_1) { 9256 TEST_REQUIRES_X86_AVX2; 9257 for (size_t k = 2; k < 10; k++) { 9258 GemmMicrokernelTester() 9259 .mr(4) 9260 .nr(8) 9261 .kr(1) 9262 .sr(1) 9263 .m(4) 9264 .n(8) 9265 .k(k) 9266 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9267 } 9268 } 9269 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,k_gt_1_subtile)9270 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, k_gt_1_subtile) { 9271 TEST_REQUIRES_X86_AVX2; 9272 for (size_t k = 2; k < 10; k++) { 9273 for (uint32_t n = 1; n <= 8; n++) { 9274 for (uint32_t m = 1; m <= 4; m++) { 9275 GemmMicrokernelTester() 9276 .mr(4) 9277 .nr(8) 9278 .kr(1) 9279 .sr(1) 9280 .m(m) 9281 .n(n) 9282 .k(k) 9283 .iterations(1) 9284 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9285 } 9286 } 9287 } 9288 } 9289 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8)9290 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8) { 9291 TEST_REQUIRES_X86_AVX2; 9292 for (uint32_t n = 9; n < 16; n++) { 9293 for 
(size_t k = 1; k <= 5; k += 2) { 9294 GemmMicrokernelTester() 9295 .mr(4) 9296 .nr(8) 9297 .kr(1) 9298 .sr(1) 9299 .m(4) 9300 .n(n) 9301 .k(k) 9302 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9303 } 9304 } 9305 } 9306 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8_strided_cn)9307 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8_strided_cn) { 9308 TEST_REQUIRES_X86_AVX2; 9309 for (uint32_t n = 9; n < 16; n++) { 9310 for (size_t k = 1; k <= 5; k += 2) { 9311 GemmMicrokernelTester() 9312 .mr(4) 9313 .nr(8) 9314 .kr(1) 9315 .sr(1) 9316 .m(4) 9317 .n(n) 9318 .k(k) 9319 .cn_stride(11) 9320 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9321 } 9322 } 9323 } 9324 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8_subtile)9325 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8_subtile) { 9326 TEST_REQUIRES_X86_AVX2; 9327 for (uint32_t n = 9; n < 16; n++) { 9328 for (size_t k = 1; k <= 5; k += 2) { 9329 for (uint32_t m = 1; m <= 4; m++) { 9330 GemmMicrokernelTester() 9331 .mr(4) 9332 .nr(8) 9333 .kr(1) 9334 .sr(1) 9335 .m(m) 9336 .n(n) 9337 .k(k) 9338 .iterations(1) 9339 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9340 } 9341 } 9342 } 9343 } 9344 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8)9345 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8) { 9346 TEST_REQUIRES_X86_AVX2; 9347 for (uint32_t n = 16; n <= 24; n += 8) { 9348 for (size_t k = 1; k <= 5; k += 2) { 9349 GemmMicrokernelTester() 9350 .mr(4) 9351 .nr(8) 9352 .kr(1) 9353 .sr(1) 9354 .m(4) 9355 .n(n) 9356 .k(k) 9357 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9358 } 9359 } 9360 } 9361 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8_strided_cn)9362 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8_strided_cn) { 9363 TEST_REQUIRES_X86_AVX2; 9364 for (uint32_t n = 16; n <= 24; n += 8) { 9365 for (size_t k = 1; k <= 5; k += 2) { 9366 
GemmMicrokernelTester() 9367 .mr(4) 9368 .nr(8) 9369 .kr(1) 9370 .sr(1) 9371 .m(4) 9372 .n(n) 9373 .k(k) 9374 .cn_stride(11) 9375 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9376 } 9377 } 9378 } 9379 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8_subtile)9380 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8_subtile) { 9381 TEST_REQUIRES_X86_AVX2; 9382 for (uint32_t n = 16; n <= 24; n += 8) { 9383 for (size_t k = 1; k <= 5; k += 2) { 9384 for (uint32_t m = 1; m <= 4; m++) { 9385 GemmMicrokernelTester() 9386 .mr(4) 9387 .nr(8) 9388 .kr(1) 9389 .sr(1) 9390 .m(m) 9391 .n(n) 9392 .k(k) 9393 .iterations(1) 9394 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9395 } 9396 } 9397 } 9398 } 9399 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,small_kernel)9400 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, small_kernel) { 9401 TEST_REQUIRES_X86_AVX2; 9402 for (size_t k = 1; k <= 5; k += 2) { 9403 GemmMicrokernelTester() 9404 .mr(4) 9405 .nr(8) 9406 .kr(1) 9407 .sr(1) 9408 .m(4) 9409 .n(8) 9410 .k(k) 9411 .ks(3) 9412 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9413 } 9414 } 9415 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,small_kernel_subtile)9416 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, small_kernel_subtile) { 9417 TEST_REQUIRES_X86_AVX2; 9418 for (size_t k = 1; k <= 5; k += 2) { 9419 for (uint32_t n = 1; n <= 8; n++) { 9420 for (uint32_t m = 1; m <= 4; m++) { 9421 GemmMicrokernelTester() 9422 .mr(4) 9423 .nr(8) 9424 .kr(1) 9425 .sr(1) 9426 .m(m) 9427 .n(n) 9428 .k(k) 9429 .ks(3) 9430 .iterations(1) 9431 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9432 } 9433 } 9434 } 9435 } 9436 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_gt_8_small_kernel)9437 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_gt_8_small_kernel) { 9438 TEST_REQUIRES_X86_AVX2; 9439 for (uint32_t n = 9; n < 16; n++) { 9440 for (size_t k = 1; k <= 
5; k += 2) { 9441 GemmMicrokernelTester() 9442 .mr(4) 9443 .nr(8) 9444 .kr(1) 9445 .sr(1) 9446 .m(4) 9447 .n(n) 9448 .k(k) 9449 .ks(3) 9450 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9451 } 9452 } 9453 } 9454 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,n_div_8_small_kernel)9455 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, n_div_8_small_kernel) { 9456 TEST_REQUIRES_X86_AVX2; 9457 for (uint32_t n = 16; n <= 24; n += 8) { 9458 for (size_t k = 1; k <= 5; k += 2) { 9459 GemmMicrokernelTester() 9460 .mr(4) 9461 .nr(8) 9462 .kr(1) 9463 .sr(1) 9464 .m(4) 9465 .n(n) 9466 .k(k) 9467 .ks(3) 9468 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9469 } 9470 } 9471 } 9472 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,strided_cm_subtile)9473 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, strided_cm_subtile) { 9474 TEST_REQUIRES_X86_AVX2; 9475 for (size_t k = 1; k <= 5; k += 2) { 9476 for (uint32_t n = 1; n <= 8; n++) { 9477 for (uint32_t m = 1; m <= 4; m++) { 9478 GemmMicrokernelTester() 9479 .mr(4) 9480 .nr(8) 9481 .kr(1) 9482 .sr(1) 9483 .m(m) 9484 .n(n) 9485 .k(k) 9486 .cm_stride(11) 9487 .iterations(1) 9488 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9489 } 9490 } 9491 } 9492 } 9493 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,a_offset)9494 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, a_offset) { 9495 TEST_REQUIRES_X86_AVX2; 9496 for (size_t k = 1; k <= 5; k += 2) { 9497 GemmMicrokernelTester() 9498 .mr(4) 9499 .nr(8) 9500 .kr(1) 9501 .sr(1) 9502 .m(4) 9503 .n(8) 9504 .k(k) 9505 .ks(3) 9506 .a_offset(23) 9507 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9508 } 9509 } 9510 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,zero)9511 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, zero) { 9512 TEST_REQUIRES_X86_AVX2; 9513 for (size_t k = 1; k <= 5; k += 2) { 9514 for (uint32_t mz = 0; mz < 4; mz++) { 9515 GemmMicrokernelTester() 9516 
.mr(4) 9517 .nr(8) 9518 .kr(1) 9519 .sr(1) 9520 .m(4) 9521 .n(8) 9522 .k(k) 9523 .ks(3) 9524 .a_offset(23) 9525 .zero_index(mz) 9526 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9527 } 9528 } 9529 } 9530 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,qmin)9531 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, qmin) { 9532 TEST_REQUIRES_X86_AVX2; 9533 GemmMicrokernelTester() 9534 .mr(4) 9535 .nr(8) 9536 .kr(1) 9537 .sr(1) 9538 .m(4) 9539 .n(8) 9540 .k(1) 9541 .qmin(128) 9542 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9543 } 9544 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,qmax)9545 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, qmax) { 9546 TEST_REQUIRES_X86_AVX2; 9547 GemmMicrokernelTester() 9548 .mr(4) 9549 .nr(8) 9550 .kr(1) 9551 .sr(1) 9552 .m(4) 9553 .n(8) 9554 .k(1) 9555 .qmax(128) 9556 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9557 } 9558 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST,strided_cm)9559 TEST(F16_IGEMM_MINMAX_4X8__AVX2_BROADCAST, strided_cm) { 9560 TEST_REQUIRES_X86_AVX2; 9561 GemmMicrokernelTester() 9562 .mr(4) 9563 .nr(8) 9564 .kr(1) 9565 .sr(1) 9566 .m(4) 9567 .n(8) 9568 .k(1) 9569 .cm_stride(11) 9570 .Test(xnn_f16_igemm_minmax_ukernel_4x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9571 } 9572 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 9573 9574 9575 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1)9576 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1) { 9577 TEST_REQUIRES_X86_AVX2; 9578 GemmMicrokernelTester() 9579 .mr(4) 9580 .nr(16) 9581 .kr(1) 9582 .sr(1) 9583 .m(4) 9584 .n(16) 9585 .k(1) 9586 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9587 } 9588 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,strided_cn)9589 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, strided_cn) { 9590 TEST_REQUIRES_X86_AVX2; 9591 GemmMicrokernelTester() 9592 .mr(4) 9593 
.nr(16) 9594 .kr(1) 9595 .sr(1) 9596 .m(4) 9597 .n(16) 9598 .k(1) 9599 .cn_stride(19) 9600 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9601 } 9602 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_subtile)9603 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_subtile) { 9604 TEST_REQUIRES_X86_AVX2; 9605 for (uint32_t n = 1; n <= 16; n++) { 9606 for (uint32_t m = 1; m <= 4; m++) { 9607 GemmMicrokernelTester() 9608 .mr(4) 9609 .nr(16) 9610 .kr(1) 9611 .sr(1) 9612 .m(m) 9613 .n(n) 9614 .k(1) 9615 .iterations(1) 9616 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9617 } 9618 } 9619 } 9620 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_subtile_m)9621 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 9622 TEST_REQUIRES_X86_AVX2; 9623 for (uint32_t m = 1; m <= 4; m++) { 9624 GemmMicrokernelTester() 9625 .mr(4) 9626 .nr(16) 9627 .kr(1) 9628 .sr(1) 9629 .m(m) 9630 .n(16) 9631 .k(1) 9632 .iterations(1) 9633 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9634 } 9635 } 9636 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,k_eq_1_subtile_n)9637 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 9638 TEST_REQUIRES_X86_AVX2; 9639 for (uint32_t n = 1; n <= 16; n++) { 9640 GemmMicrokernelTester() 9641 .mr(4) 9642 .nr(16) 9643 .kr(1) 9644 .sr(1) 9645 .m(4) 9646 .n(n) 9647 .k(1) 9648 .iterations(1) 9649 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9650 } 9651 } 9652 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,k_gt_1)9653 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, k_gt_1) { 9654 TEST_REQUIRES_X86_AVX2; 9655 for (size_t k = 2; k < 10; k++) { 9656 GemmMicrokernelTester() 9657 .mr(4) 9658 .nr(16) 9659 .kr(1) 9660 .sr(1) 9661 .m(4) 9662 .n(16) 9663 .k(k) 9664 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9665 } 9666 } 9667 
// k swept over 2..9 for every (n, m) with n in 1..16 and m in 1..4; one iteration each.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n swept over 17..31 with k in {1, 3, 5}; m fixed at 4.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_gt_16, with cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n swept over 17..31, k in {1, 3, 5}, m swept over 1..4; one iteration each.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n in {32, 48} with k in {1, 3, 5}; m fixed at 4.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_div_16, with cn_stride set to 19.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(4).n(n).k(k)
          .cn_stride(19)
          .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n in {32, 48}, k in {1, 3, 5}, m swept over 1..4; one iteration each.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
            .mr(4).nr(16).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// Full 4x16 tile with ks = 3 and k in {1, 3, 5}.
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 1; k <= 5; k += 2) {
    GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(16).k(k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}
TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,small_kernel_subtile)9814 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, small_kernel_subtile) { 9815 TEST_REQUIRES_X86_AVX2; 9816 for (size_t k = 1; k <= 5; k += 2) { 9817 for (uint32_t n = 1; n <= 16; n++) { 9818 for (uint32_t m = 1; m <= 4; m++) { 9819 GemmMicrokernelTester() 9820 .mr(4) 9821 .nr(16) 9822 .kr(1) 9823 .sr(1) 9824 .m(m) 9825 .n(n) 9826 .k(k) 9827 .ks(3) 9828 .iterations(1) 9829 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9830 } 9831 } 9832 } 9833 } 9834 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,n_gt_16_small_kernel)9835 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_gt_16_small_kernel) { 9836 TEST_REQUIRES_X86_AVX2; 9837 for (uint32_t n = 17; n < 32; n++) { 9838 for (size_t k = 1; k <= 5; k += 2) { 9839 GemmMicrokernelTester() 9840 .mr(4) 9841 .nr(16) 9842 .kr(1) 9843 .sr(1) 9844 .m(4) 9845 .n(n) 9846 .k(k) 9847 .ks(3) 9848 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9849 } 9850 } 9851 } 9852 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,n_div_16_small_kernel)9853 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, n_div_16_small_kernel) { 9854 TEST_REQUIRES_X86_AVX2; 9855 for (uint32_t n = 32; n <= 48; n += 16) { 9856 for (size_t k = 1; k <= 5; k += 2) { 9857 GemmMicrokernelTester() 9858 .mr(4) 9859 .nr(16) 9860 .kr(1) 9861 .sr(1) 9862 .m(4) 9863 .n(n) 9864 .k(k) 9865 .ks(3) 9866 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9867 } 9868 } 9869 } 9870 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,strided_cm_subtile)9871 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, strided_cm_subtile) { 9872 TEST_REQUIRES_X86_AVX2; 9873 for (size_t k = 1; k <= 5; k += 2) { 9874 for (uint32_t n = 1; n <= 16; n++) { 9875 for (uint32_t m = 1; m <= 4; m++) { 9876 GemmMicrokernelTester() 9877 .mr(4) 9878 .nr(16) 9879 .kr(1) 9880 .sr(1) 9881 .m(m) 9882 .n(n) 9883 .k(k) 9884 .cm_stride(19) 9885 .iterations(1) 9886 
.Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9887 } 9888 } 9889 } 9890 } 9891 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,a_offset)9892 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, a_offset) { 9893 TEST_REQUIRES_X86_AVX2; 9894 for (size_t k = 1; k <= 5; k += 2) { 9895 GemmMicrokernelTester() 9896 .mr(4) 9897 .nr(16) 9898 .kr(1) 9899 .sr(1) 9900 .m(4) 9901 .n(16) 9902 .k(k) 9903 .ks(3) 9904 .a_offset(23) 9905 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9906 } 9907 } 9908 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,zero)9909 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, zero) { 9910 TEST_REQUIRES_X86_AVX2; 9911 for (size_t k = 1; k <= 5; k += 2) { 9912 for (uint32_t mz = 0; mz < 4; mz++) { 9913 GemmMicrokernelTester() 9914 .mr(4) 9915 .nr(16) 9916 .kr(1) 9917 .sr(1) 9918 .m(4) 9919 .n(16) 9920 .k(k) 9921 .ks(3) 9922 .a_offset(23) 9923 .zero_index(mz) 9924 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9925 } 9926 } 9927 } 9928 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,qmin)9929 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, qmin) { 9930 TEST_REQUIRES_X86_AVX2; 9931 GemmMicrokernelTester() 9932 .mr(4) 9933 .nr(16) 9934 .kr(1) 9935 .sr(1) 9936 .m(4) 9937 .n(16) 9938 .k(1) 9939 .qmin(128) 9940 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9941 } 9942 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,qmax)9943 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, qmax) { 9944 TEST_REQUIRES_X86_AVX2; 9945 GemmMicrokernelTester() 9946 .mr(4) 9947 .nr(16) 9948 .kr(1) 9949 .sr(1) 9950 .m(4) 9951 .n(16) 9952 .k(1) 9953 .qmax(128) 9954 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9955 } 9956 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST,strided_cm)9957 TEST(F16_IGEMM_MINMAX_4X16__AVX2_BROADCAST, strided_cm) { 9958 TEST_REQUIRES_X86_AVX2; 9959 GemmMicrokernelTester() 9960 .mr(4) 9961 
.nr(16) 9962 .kr(1) 9963 .sr(1) 9964 .m(4) 9965 .n(16) 9966 .k(1) 9967 .cm_stride(19) 9968 .Test(xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9969 } 9970 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 9971 9972 9973 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1)9974 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1) { 9975 TEST_REQUIRES_X86_AVX2; 9976 GemmMicrokernelTester() 9977 .mr(5) 9978 .nr(8) 9979 .kr(1) 9980 .sr(1) 9981 .m(5) 9982 .n(8) 9983 .k(1) 9984 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9985 } 9986 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,strided_cn)9987 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, strided_cn) { 9988 TEST_REQUIRES_X86_AVX2; 9989 GemmMicrokernelTester() 9990 .mr(5) 9991 .nr(8) 9992 .kr(1) 9993 .sr(1) 9994 .m(5) 9995 .n(8) 9996 .k(1) 9997 .cn_stride(11) 9998 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 9999 } 10000 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1_subtile)10001 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_subtile) { 10002 TEST_REQUIRES_X86_AVX2; 10003 for (uint32_t n = 1; n <= 8; n++) { 10004 for (uint32_t m = 1; m <= 5; m++) { 10005 GemmMicrokernelTester() 10006 .mr(5) 10007 .nr(8) 10008 .kr(1) 10009 .sr(1) 10010 .m(m) 10011 .n(n) 10012 .k(1) 10013 .iterations(1) 10014 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10015 } 10016 } 10017 } 10018 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,k_eq_1_subtile_m)10019 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 10020 TEST_REQUIRES_X86_AVX2; 10021 for (uint32_t m = 1; m <= 5; m++) { 10022 GemmMicrokernelTester() 10023 .mr(5) 10024 .nr(8) 10025 .kr(1) 10026 .sr(1) 10027 .m(m) 10028 .n(8) 10029 .k(1) 10030 .iterations(1) 10031 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10032 } 10033 } 10034 
// k = 1, m fixed at 5, n swept over 1..8; one iteration per configuration.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, k_eq_1_subtile_n) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 1; n <= 8; ++n) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(n).k(1)
        .iterations(1)
        .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// Full 5x8 tile with k swept over 2..9.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; ++k) {
    GemmMicrokernelTester()
        .mr(5).nr(8).kr(1).sr(1)
        .m(5).n(8).k(k)
        .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k swept over 2..9 for every (n, m) with n in 1..8 and m in 1..5; one iteration each.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t k = 2; k < 10; ++k) {
    for (uint32_t n = 1; n <= 8; ++n) {
      for (uint32_t m = 1; m <= 5; ++m) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n swept over 9..15 with k in {1, 3, 5}; m fixed at 5.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n).k(k)
          .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_gt_8, with cn_stride set to 11.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n).k(k)
          .cn_stride(11)
          .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n swept over 9..15, k in {1, 3, 5}, m swept over 1..5; one iteration each.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 9; n < 16; ++n) {
    for (size_t k = 1; k <= 5; k += 2) {
      for (uint32_t m = 1; m <= 5; ++m) {
        GemmMicrokernelTester()
            .mr(5).nr(8).kr(1).sr(1)
            .m(m).n(n).k(k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// n in {16, 24} with k in {1, 3, 5}; m fixed at 5.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n).k(k)
          .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// Same sweep as n_div_8, with cn_stride set to 11.
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 5; k += 2) {
      GemmMicrokernelTester()
          .mr(5).nr(8).kr(1).sr(1)
          .m(5).n(n).k(k)
          .cn_stride(11)
          .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}
TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,n_div_8_subtile)10176 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8_subtile) { 10177 TEST_REQUIRES_X86_AVX2; 10178 for (uint32_t n = 16; n <= 24; n += 8) { 10179 for (size_t k = 1; k <= 5; k += 2) { 10180 for (uint32_t m = 1; m <= 5; m++) { 10181 GemmMicrokernelTester() 10182 .mr(5) 10183 .nr(8) 10184 .kr(1) 10185 .sr(1) 10186 .m(m) 10187 .n(n) 10188 .k(k) 10189 .iterations(1) 10190 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10191 } 10192 } 10193 } 10194 } 10195 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,small_kernel)10196 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, small_kernel) { 10197 TEST_REQUIRES_X86_AVX2; 10198 for (size_t k = 1; k <= 5; k += 2) { 10199 GemmMicrokernelTester() 10200 .mr(5) 10201 .nr(8) 10202 .kr(1) 10203 .sr(1) 10204 .m(5) 10205 .n(8) 10206 .k(k) 10207 .ks(3) 10208 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10209 } 10210 } 10211 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,small_kernel_subtile)10212 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, small_kernel_subtile) { 10213 TEST_REQUIRES_X86_AVX2; 10214 for (size_t k = 1; k <= 5; k += 2) { 10215 for (uint32_t n = 1; n <= 8; n++) { 10216 for (uint32_t m = 1; m <= 5; m++) { 10217 GemmMicrokernelTester() 10218 .mr(5) 10219 .nr(8) 10220 .kr(1) 10221 .sr(1) 10222 .m(m) 10223 .n(n) 10224 .k(k) 10225 .ks(3) 10226 .iterations(1) 10227 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10228 } 10229 } 10230 } 10231 } 10232 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,n_gt_8_small_kernel)10233 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_gt_8_small_kernel) { 10234 TEST_REQUIRES_X86_AVX2; 10235 for (uint32_t n = 9; n < 16; n++) { 10236 for (size_t k = 1; k <= 5; k += 2) { 10237 GemmMicrokernelTester() 10238 .mr(5) 10239 .nr(8) 10240 .kr(1) 10241 .sr(1) 10242 .m(5) 10243 .n(n) 10244 .k(k) 10245 .ks(3) 10246 
.Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10247 } 10248 } 10249 } 10250 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,n_div_8_small_kernel)10251 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, n_div_8_small_kernel) { 10252 TEST_REQUIRES_X86_AVX2; 10253 for (uint32_t n = 16; n <= 24; n += 8) { 10254 for (size_t k = 1; k <= 5; k += 2) { 10255 GemmMicrokernelTester() 10256 .mr(5) 10257 .nr(8) 10258 .kr(1) 10259 .sr(1) 10260 .m(5) 10261 .n(n) 10262 .k(k) 10263 .ks(3) 10264 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10265 } 10266 } 10267 } 10268 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,strided_cm_subtile)10269 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, strided_cm_subtile) { 10270 TEST_REQUIRES_X86_AVX2; 10271 for (size_t k = 1; k <= 5; k += 2) { 10272 for (uint32_t n = 1; n <= 8; n++) { 10273 for (uint32_t m = 1; m <= 5; m++) { 10274 GemmMicrokernelTester() 10275 .mr(5) 10276 .nr(8) 10277 .kr(1) 10278 .sr(1) 10279 .m(m) 10280 .n(n) 10281 .k(k) 10282 .cm_stride(11) 10283 .iterations(1) 10284 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10285 } 10286 } 10287 } 10288 } 10289 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,a_offset)10290 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, a_offset) { 10291 TEST_REQUIRES_X86_AVX2; 10292 for (size_t k = 1; k <= 5; k += 2) { 10293 GemmMicrokernelTester() 10294 .mr(5) 10295 .nr(8) 10296 .kr(1) 10297 .sr(1) 10298 .m(5) 10299 .n(8) 10300 .k(k) 10301 .ks(3) 10302 .a_offset(29) 10303 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10304 } 10305 } 10306 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,zero)10307 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, zero) { 10308 TEST_REQUIRES_X86_AVX2; 10309 for (size_t k = 1; k <= 5; k += 2) { 10310 for (uint32_t mz = 0; mz < 5; mz++) { 10311 GemmMicrokernelTester() 10312 .mr(5) 10313 .nr(8) 10314 .kr(1) 10315 .sr(1) 10316 .m(5) 10317 .n(8) 10318 
.k(k) 10319 .ks(3) 10320 .a_offset(29) 10321 .zero_index(mz) 10322 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10323 } 10324 } 10325 } 10326 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,qmin)10327 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, qmin) { 10328 TEST_REQUIRES_X86_AVX2; 10329 GemmMicrokernelTester() 10330 .mr(5) 10331 .nr(8) 10332 .kr(1) 10333 .sr(1) 10334 .m(5) 10335 .n(8) 10336 .k(1) 10337 .qmin(128) 10338 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10339 } 10340 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,qmax)10341 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, qmax) { 10342 TEST_REQUIRES_X86_AVX2; 10343 GemmMicrokernelTester() 10344 .mr(5) 10345 .nr(8) 10346 .kr(1) 10347 .sr(1) 10348 .m(5) 10349 .n(8) 10350 .k(1) 10351 .qmax(128) 10352 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10353 } 10354 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST,strided_cm)10355 TEST(F16_IGEMM_MINMAX_5X8__AVX2_BROADCAST, strided_cm) { 10356 TEST_REQUIRES_X86_AVX2; 10357 GemmMicrokernelTester() 10358 .mr(5) 10359 .nr(8) 10360 .kr(1) 10361 .sr(1) 10362 .m(5) 10363 .n(8) 10364 .k(1) 10365 .cm_stride(11) 10366 .Test(xnn_f16_igemm_minmax_ukernel_5x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10367 } 10368 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 10369 10370 10371 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1)10372 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1) { 10373 TEST_REQUIRES_X86_AVX2; 10374 GemmMicrokernelTester() 10375 .mr(5) 10376 .nr(16) 10377 .kr(1) 10378 .sr(1) 10379 .m(5) 10380 .n(16) 10381 .k(1) 10382 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10383 } 10384 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,strided_cn)10385 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, strided_cn) { 10386 TEST_REQUIRES_X86_AVX2; 10387 GemmMicrokernelTester() 10388 .mr(5) 10389 
.nr(16) 10390 .kr(1) 10391 .sr(1) 10392 .m(5) 10393 .n(16) 10394 .k(1) 10395 .cn_stride(19) 10396 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10397 } 10398 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_subtile)10399 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_subtile) { 10400 TEST_REQUIRES_X86_AVX2; 10401 for (uint32_t n = 1; n <= 16; n++) { 10402 for (uint32_t m = 1; m <= 5; m++) { 10403 GemmMicrokernelTester() 10404 .mr(5) 10405 .nr(16) 10406 .kr(1) 10407 .sr(1) 10408 .m(m) 10409 .n(n) 10410 .k(1) 10411 .iterations(1) 10412 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10413 } 10414 } 10415 } 10416 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_subtile_m)10417 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_subtile_m) { 10418 TEST_REQUIRES_X86_AVX2; 10419 for (uint32_t m = 1; m <= 5; m++) { 10420 GemmMicrokernelTester() 10421 .mr(5) 10422 .nr(16) 10423 .kr(1) 10424 .sr(1) 10425 .m(m) 10426 .n(16) 10427 .k(1) 10428 .iterations(1) 10429 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10430 } 10431 } 10432 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,k_eq_1_subtile_n)10433 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, k_eq_1_subtile_n) { 10434 TEST_REQUIRES_X86_AVX2; 10435 for (uint32_t n = 1; n <= 16; n++) { 10436 GemmMicrokernelTester() 10437 .mr(5) 10438 .nr(16) 10439 .kr(1) 10440 .sr(1) 10441 .m(5) 10442 .n(n) 10443 .k(1) 10444 .iterations(1) 10445 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10446 } 10447 } 10448 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,k_gt_1)10449 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, k_gt_1) { 10450 TEST_REQUIRES_X86_AVX2; 10451 for (size_t k = 2; k < 10; k++) { 10452 GemmMicrokernelTester() 10453 .mr(5) 10454 .nr(16) 10455 .kr(1) 10456 .sr(1) 10457 .m(5) 10458 .n(16) 10459 .k(k) 10460 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, 
xnn_init_f16_minmax_avx_params); 10461 } 10462 } 10463 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,k_gt_1_subtile)10464 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, k_gt_1_subtile) { 10465 TEST_REQUIRES_X86_AVX2; 10466 for (size_t k = 2; k < 10; k++) { 10467 for (uint32_t n = 1; n <= 16; n++) { 10468 for (uint32_t m = 1; m <= 5; m++) { 10469 GemmMicrokernelTester() 10470 .mr(5) 10471 .nr(16) 10472 .kr(1) 10473 .sr(1) 10474 .m(m) 10475 .n(n) 10476 .k(k) 10477 .iterations(1) 10478 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10479 } 10480 } 10481 } 10482 } 10483 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_gt_16)10484 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16) { 10485 TEST_REQUIRES_X86_AVX2; 10486 for (uint32_t n = 17; n < 32; n++) { 10487 for (size_t k = 1; k <= 5; k += 2) { 10488 GemmMicrokernelTester() 10489 .mr(5) 10490 .nr(16) 10491 .kr(1) 10492 .sr(1) 10493 .m(5) 10494 .n(n) 10495 .k(k) 10496 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10497 } 10498 } 10499 } 10500 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_gt_16_strided_cn)10501 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16_strided_cn) { 10502 TEST_REQUIRES_X86_AVX2; 10503 for (uint32_t n = 17; n < 32; n++) { 10504 for (size_t k = 1; k <= 5; k += 2) { 10505 GemmMicrokernelTester() 10506 .mr(5) 10507 .nr(16) 10508 .kr(1) 10509 .sr(1) 10510 .m(5) 10511 .n(n) 10512 .k(k) 10513 .cn_stride(19) 10514 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10515 } 10516 } 10517 } 10518 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_gt_16_subtile)10519 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16_subtile) { 10520 TEST_REQUIRES_X86_AVX2; 10521 for (uint32_t n = 17; n < 32; n++) { 10522 for (size_t k = 1; k <= 5; k += 2) { 10523 for (uint32_t m = 1; m <= 5; m++) { 10524 GemmMicrokernelTester() 10525 .mr(5) 10526 .nr(16) 10527 .kr(1) 10528 .sr(1) 10529 .m(m) 10530 .n(n) 10531 
.k(k) 10532 .iterations(1) 10533 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10534 } 10535 } 10536 } 10537 } 10538 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16)10539 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16) { 10540 TEST_REQUIRES_X86_AVX2; 10541 for (uint32_t n = 32; n <= 48; n += 16) { 10542 for (size_t k = 1; k <= 5; k += 2) { 10543 GemmMicrokernelTester() 10544 .mr(5) 10545 .nr(16) 10546 .kr(1) 10547 .sr(1) 10548 .m(5) 10549 .n(n) 10550 .k(k) 10551 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10552 } 10553 } 10554 } 10555 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16_strided_cn)10556 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16_strided_cn) { 10557 TEST_REQUIRES_X86_AVX2; 10558 for (uint32_t n = 32; n <= 48; n += 16) { 10559 for (size_t k = 1; k <= 5; k += 2) { 10560 GemmMicrokernelTester() 10561 .mr(5) 10562 .nr(16) 10563 .kr(1) 10564 .sr(1) 10565 .m(5) 10566 .n(n) 10567 .k(k) 10568 .cn_stride(19) 10569 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10570 } 10571 } 10572 } 10573 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16_subtile)10574 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16_subtile) { 10575 TEST_REQUIRES_X86_AVX2; 10576 for (uint32_t n = 32; n <= 48; n += 16) { 10577 for (size_t k = 1; k <= 5; k += 2) { 10578 for (uint32_t m = 1; m <= 5; m++) { 10579 GemmMicrokernelTester() 10580 .mr(5) 10581 .nr(16) 10582 .kr(1) 10583 .sr(1) 10584 .m(m) 10585 .n(n) 10586 .k(k) 10587 .iterations(1) 10588 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10589 } 10590 } 10591 } 10592 } 10593 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,small_kernel)10594 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, small_kernel) { 10595 TEST_REQUIRES_X86_AVX2; 10596 for (size_t k = 1; k <= 5; k += 2) { 10597 GemmMicrokernelTester() 10598 .mr(5) 10599 .nr(16) 10600 .kr(1) 
10601 .sr(1) 10602 .m(5) 10603 .n(16) 10604 .k(k) 10605 .ks(3) 10606 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10607 } 10608 } 10609 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,small_kernel_subtile)10610 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, small_kernel_subtile) { 10611 TEST_REQUIRES_X86_AVX2; 10612 for (size_t k = 1; k <= 5; k += 2) { 10613 for (uint32_t n = 1; n <= 16; n++) { 10614 for (uint32_t m = 1; m <= 5; m++) { 10615 GemmMicrokernelTester() 10616 .mr(5) 10617 .nr(16) 10618 .kr(1) 10619 .sr(1) 10620 .m(m) 10621 .n(n) 10622 .k(k) 10623 .ks(3) 10624 .iterations(1) 10625 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10626 } 10627 } 10628 } 10629 } 10630 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_gt_16_small_kernel)10631 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_gt_16_small_kernel) { 10632 TEST_REQUIRES_X86_AVX2; 10633 for (uint32_t n = 17; n < 32; n++) { 10634 for (size_t k = 1; k <= 5; k += 2) { 10635 GemmMicrokernelTester() 10636 .mr(5) 10637 .nr(16) 10638 .kr(1) 10639 .sr(1) 10640 .m(5) 10641 .n(n) 10642 .k(k) 10643 .ks(3) 10644 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10645 } 10646 } 10647 } 10648 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,n_div_16_small_kernel)10649 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, n_div_16_small_kernel) { 10650 TEST_REQUIRES_X86_AVX2; 10651 for (uint32_t n = 32; n <= 48; n += 16) { 10652 for (size_t k = 1; k <= 5; k += 2) { 10653 GemmMicrokernelTester() 10654 .mr(5) 10655 .nr(16) 10656 .kr(1) 10657 .sr(1) 10658 .m(5) 10659 .n(n) 10660 .k(k) 10661 .ks(3) 10662 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10663 } 10664 } 10665 } 10666 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,strided_cm_subtile)10667 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, strided_cm_subtile) { 10668 TEST_REQUIRES_X86_AVX2; 10669 for (size_t k = 1; k <= 5; k += 2) 
{ 10670 for (uint32_t n = 1; n <= 16; n++) { 10671 for (uint32_t m = 1; m <= 5; m++) { 10672 GemmMicrokernelTester() 10673 .mr(5) 10674 .nr(16) 10675 .kr(1) 10676 .sr(1) 10677 .m(m) 10678 .n(n) 10679 .k(k) 10680 .cm_stride(19) 10681 .iterations(1) 10682 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10683 } 10684 } 10685 } 10686 } 10687 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,a_offset)10688 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, a_offset) { 10689 TEST_REQUIRES_X86_AVX2; 10690 for (size_t k = 1; k <= 5; k += 2) { 10691 GemmMicrokernelTester() 10692 .mr(5) 10693 .nr(16) 10694 .kr(1) 10695 .sr(1) 10696 .m(5) 10697 .n(16) 10698 .k(k) 10699 .ks(3) 10700 .a_offset(29) 10701 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10702 } 10703 } 10704 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,zero)10705 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, zero) { 10706 TEST_REQUIRES_X86_AVX2; 10707 for (size_t k = 1; k <= 5; k += 2) { 10708 for (uint32_t mz = 0; mz < 5; mz++) { 10709 GemmMicrokernelTester() 10710 .mr(5) 10711 .nr(16) 10712 .kr(1) 10713 .sr(1) 10714 .m(5) 10715 .n(16) 10716 .k(k) 10717 .ks(3) 10718 .a_offset(29) 10719 .zero_index(mz) 10720 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10721 } 10722 } 10723 } 10724 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,qmin)10725 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, qmin) { 10726 TEST_REQUIRES_X86_AVX2; 10727 GemmMicrokernelTester() 10728 .mr(5) 10729 .nr(16) 10730 .kr(1) 10731 .sr(1) 10732 .m(5) 10733 .n(16) 10734 .k(1) 10735 .qmin(128) 10736 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10737 } 10738 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,qmax)10739 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, qmax) { 10740 TEST_REQUIRES_X86_AVX2; 10741 GemmMicrokernelTester() 10742 .mr(5) 10743 .nr(16) 10744 .kr(1) 10745 .sr(1) 10746 .m(5) 10747 .n(16) 10748 
.k(1) 10749 .qmax(128) 10750 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10751 } 10752 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST,strided_cm)10753 TEST(F16_IGEMM_MINMAX_5X16__AVX2_BROADCAST, strided_cm) { 10754 TEST_REQUIRES_X86_AVX2; 10755 GemmMicrokernelTester() 10756 .mr(5) 10757 .nr(16) 10758 .kr(1) 10759 .sr(1) 10760 .m(5) 10761 .n(16) 10762 .k(1) 10763 .cm_stride(19) 10764 .Test(xnn_f16_igemm_minmax_ukernel_5x16__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10765 } 10766 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 10767 10768 10769 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1)10770 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1) { 10771 TEST_REQUIRES_X86_AVX2; 10772 GemmMicrokernelTester() 10773 .mr(6) 10774 .nr(8) 10775 .kr(1) 10776 .sr(1) 10777 .m(6) 10778 .n(8) 10779 .k(1) 10780 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10781 } 10782 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,strided_cn)10783 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, strided_cn) { 10784 TEST_REQUIRES_X86_AVX2; 10785 GemmMicrokernelTester() 10786 .mr(6) 10787 .nr(8) 10788 .kr(1) 10789 .sr(1) 10790 .m(6) 10791 .n(8) 10792 .k(1) 10793 .cn_stride(11) 10794 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10795 } 10796 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_subtile)10797 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1_subtile) { 10798 TEST_REQUIRES_X86_AVX2; 10799 for (uint32_t n = 1; n <= 8; n++) { 10800 for (uint32_t m = 1; m <= 6; m++) { 10801 GemmMicrokernelTester() 10802 .mr(6) 10803 .nr(8) 10804 .kr(1) 10805 .sr(1) 10806 .m(m) 10807 .n(n) 10808 .k(1) 10809 .iterations(1) 10810 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10811 } 10812 } 10813 } 10814 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_subtile_m)10815 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, 
k_eq_1_subtile_m) { 10816 TEST_REQUIRES_X86_AVX2; 10817 for (uint32_t m = 1; m <= 6; m++) { 10818 GemmMicrokernelTester() 10819 .mr(6) 10820 .nr(8) 10821 .kr(1) 10822 .sr(1) 10823 .m(m) 10824 .n(8) 10825 .k(1) 10826 .iterations(1) 10827 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10828 } 10829 } 10830 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,k_eq_1_subtile_n)10831 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 10832 TEST_REQUIRES_X86_AVX2; 10833 for (uint32_t n = 1; n <= 8; n++) { 10834 GemmMicrokernelTester() 10835 .mr(6) 10836 .nr(8) 10837 .kr(1) 10838 .sr(1) 10839 .m(6) 10840 .n(n) 10841 .k(1) 10842 .iterations(1) 10843 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10844 } 10845 } 10846 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,k_gt_1)10847 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, k_gt_1) { 10848 TEST_REQUIRES_X86_AVX2; 10849 for (size_t k = 2; k < 10; k++) { 10850 GemmMicrokernelTester() 10851 .mr(6) 10852 .nr(8) 10853 .kr(1) 10854 .sr(1) 10855 .m(6) 10856 .n(8) 10857 .k(k) 10858 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10859 } 10860 } 10861 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,k_gt_1_subtile)10862 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, k_gt_1_subtile) { 10863 TEST_REQUIRES_X86_AVX2; 10864 for (size_t k = 2; k < 10; k++) { 10865 for (uint32_t n = 1; n <= 8; n++) { 10866 for (uint32_t m = 1; m <= 6; m++) { 10867 GemmMicrokernelTester() 10868 .mr(6) 10869 .nr(8) 10870 .kr(1) 10871 .sr(1) 10872 .m(m) 10873 .n(n) 10874 .k(k) 10875 .iterations(1) 10876 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10877 } 10878 } 10879 } 10880 } 10881 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_gt_8)10882 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8) { 10883 TEST_REQUIRES_X86_AVX2; 10884 for (uint32_t n = 9; n < 16; n++) { 10885 for (size_t k = 1; k <= 5; k += 2) { 10886 
GemmMicrokernelTester() 10887 .mr(6) 10888 .nr(8) 10889 .kr(1) 10890 .sr(1) 10891 .m(6) 10892 .n(n) 10893 .k(k) 10894 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10895 } 10896 } 10897 } 10898 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_gt_8_strided_cn)10899 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8_strided_cn) { 10900 TEST_REQUIRES_X86_AVX2; 10901 for (uint32_t n = 9; n < 16; n++) { 10902 for (size_t k = 1; k <= 5; k += 2) { 10903 GemmMicrokernelTester() 10904 .mr(6) 10905 .nr(8) 10906 .kr(1) 10907 .sr(1) 10908 .m(6) 10909 .n(n) 10910 .k(k) 10911 .cn_stride(11) 10912 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10913 } 10914 } 10915 } 10916 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_gt_8_subtile)10917 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8_subtile) { 10918 TEST_REQUIRES_X86_AVX2; 10919 for (uint32_t n = 9; n < 16; n++) { 10920 for (size_t k = 1; k <= 5; k += 2) { 10921 for (uint32_t m = 1; m <= 6; m++) { 10922 GemmMicrokernelTester() 10923 .mr(6) 10924 .nr(8) 10925 .kr(1) 10926 .sr(1) 10927 .m(m) 10928 .n(n) 10929 .k(k) 10930 .iterations(1) 10931 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10932 } 10933 } 10934 } 10935 } 10936 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8)10937 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8) { 10938 TEST_REQUIRES_X86_AVX2; 10939 for (uint32_t n = 16; n <= 24; n += 8) { 10940 for (size_t k = 1; k <= 5; k += 2) { 10941 GemmMicrokernelTester() 10942 .mr(6) 10943 .nr(8) 10944 .kr(1) 10945 .sr(1) 10946 .m(6) 10947 .n(n) 10948 .k(k) 10949 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10950 } 10951 } 10952 } 10953 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8_strided_cn)10954 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8_strided_cn) { 10955 TEST_REQUIRES_X86_AVX2; 10956 for (uint32_t n = 16; n <= 24; n += 8) { 10957 for (size_t 
k = 1; k <= 5; k += 2) { 10958 GemmMicrokernelTester() 10959 .mr(6) 10960 .nr(8) 10961 .kr(1) 10962 .sr(1) 10963 .m(6) 10964 .n(n) 10965 .k(k) 10966 .cn_stride(11) 10967 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10968 } 10969 } 10970 } 10971 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8_subtile)10972 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8_subtile) { 10973 TEST_REQUIRES_X86_AVX2; 10974 for (uint32_t n = 16; n <= 24; n += 8) { 10975 for (size_t k = 1; k <= 5; k += 2) { 10976 for (uint32_t m = 1; m <= 6; m++) { 10977 GemmMicrokernelTester() 10978 .mr(6) 10979 .nr(8) 10980 .kr(1) 10981 .sr(1) 10982 .m(m) 10983 .n(n) 10984 .k(k) 10985 .iterations(1) 10986 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 10987 } 10988 } 10989 } 10990 } 10991 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,small_kernel)10992 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, small_kernel) { 10993 TEST_REQUIRES_X86_AVX2; 10994 for (size_t k = 1; k <= 5; k += 2) { 10995 GemmMicrokernelTester() 10996 .mr(6) 10997 .nr(8) 10998 .kr(1) 10999 .sr(1) 11000 .m(6) 11001 .n(8) 11002 .k(k) 11003 .ks(3) 11004 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11005 } 11006 } 11007 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,small_kernel_subtile)11008 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, small_kernel_subtile) { 11009 TEST_REQUIRES_X86_AVX2; 11010 for (size_t k = 1; k <= 5; k += 2) { 11011 for (uint32_t n = 1; n <= 8; n++) { 11012 for (uint32_t m = 1; m <= 6; m++) { 11013 GemmMicrokernelTester() 11014 .mr(6) 11015 .nr(8) 11016 .kr(1) 11017 .sr(1) 11018 .m(m) 11019 .n(n) 11020 .k(k) 11021 .ks(3) 11022 .iterations(1) 11023 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11024 } 11025 } 11026 } 11027 } 11028 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_gt_8_small_kernel)11029 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_gt_8_small_kernel) 
{ 11030 TEST_REQUIRES_X86_AVX2; 11031 for (uint32_t n = 9; n < 16; n++) { 11032 for (size_t k = 1; k <= 5; k += 2) { 11033 GemmMicrokernelTester() 11034 .mr(6) 11035 .nr(8) 11036 .kr(1) 11037 .sr(1) 11038 .m(6) 11039 .n(n) 11040 .k(k) 11041 .ks(3) 11042 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11043 } 11044 } 11045 } 11046 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,n_div_8_small_kernel)11047 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, n_div_8_small_kernel) { 11048 TEST_REQUIRES_X86_AVX2; 11049 for (uint32_t n = 16; n <= 24; n += 8) { 11050 for (size_t k = 1; k <= 5; k += 2) { 11051 GemmMicrokernelTester() 11052 .mr(6) 11053 .nr(8) 11054 .kr(1) 11055 .sr(1) 11056 .m(6) 11057 .n(n) 11058 .k(k) 11059 .ks(3) 11060 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11061 } 11062 } 11063 } 11064 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,strided_cm_subtile)11065 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, strided_cm_subtile) { 11066 TEST_REQUIRES_X86_AVX2; 11067 for (size_t k = 1; k <= 5; k += 2) { 11068 for (uint32_t n = 1; n <= 8; n++) { 11069 for (uint32_t m = 1; m <= 6; m++) { 11070 GemmMicrokernelTester() 11071 .mr(6) 11072 .nr(8) 11073 .kr(1) 11074 .sr(1) 11075 .m(m) 11076 .n(n) 11077 .k(k) 11078 .cm_stride(11) 11079 .iterations(1) 11080 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11081 } 11082 } 11083 } 11084 } 11085 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,a_offset)11086 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, a_offset) { 11087 TEST_REQUIRES_X86_AVX2; 11088 for (size_t k = 1; k <= 5; k += 2) { 11089 GemmMicrokernelTester() 11090 .mr(6) 11091 .nr(8) 11092 .kr(1) 11093 .sr(1) 11094 .m(6) 11095 .n(8) 11096 .k(k) 11097 .ks(3) 11098 .a_offset(37) 11099 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11100 } 11101 } 11102 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,zero)11103 
TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, zero) { 11104 TEST_REQUIRES_X86_AVX2; 11105 for (size_t k = 1; k <= 5; k += 2) { 11106 for (uint32_t mz = 0; mz < 6; mz++) { 11107 GemmMicrokernelTester() 11108 .mr(6) 11109 .nr(8) 11110 .kr(1) 11111 .sr(1) 11112 .m(6) 11113 .n(8) 11114 .k(k) 11115 .ks(3) 11116 .a_offset(37) 11117 .zero_index(mz) 11118 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11119 } 11120 } 11121 } 11122 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,qmin)11123 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, qmin) { 11124 TEST_REQUIRES_X86_AVX2; 11125 GemmMicrokernelTester() 11126 .mr(6) 11127 .nr(8) 11128 .kr(1) 11129 .sr(1) 11130 .m(6) 11131 .n(8) 11132 .k(1) 11133 .qmin(128) 11134 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11135 } 11136 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,qmax)11137 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, qmax) { 11138 TEST_REQUIRES_X86_AVX2; 11139 GemmMicrokernelTester() 11140 .mr(6) 11141 .nr(8) 11142 .kr(1) 11143 .sr(1) 11144 .m(6) 11145 .n(8) 11146 .k(1) 11147 .qmax(128) 11148 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11149 } 11150 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST,strided_cm)11151 TEST(F16_IGEMM_MINMAX_6X8__AVX2_BROADCAST, strided_cm) { 11152 TEST_REQUIRES_X86_AVX2; 11153 GemmMicrokernelTester() 11154 .mr(6) 11155 .nr(8) 11156 .kr(1) 11157 .sr(1) 11158 .m(6) 11159 .n(8) 11160 .k(1) 11161 .cm_stride(11) 11162 .Test(xnn_f16_igemm_minmax_ukernel_6x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11163 } 11164 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 11165 11166 11167 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1)11168 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1) { 11169 TEST_REQUIRES_X86_AVX2; 11170 GemmMicrokernelTester() 11171 .mr(7) 11172 .nr(8) 11173 .kr(1) 11174 .sr(1) 11175 .m(7) 11176 .n(8) 11177 .k(1) 11178 
.Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11179 } 11180 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,strided_cn)11181 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, strided_cn) { 11182 TEST_REQUIRES_X86_AVX2; 11183 GemmMicrokernelTester() 11184 .mr(7) 11185 .nr(8) 11186 .kr(1) 11187 .sr(1) 11188 .m(7) 11189 .n(8) 11190 .k(1) 11191 .cn_stride(11) 11192 .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11193 } 11194 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_subtile)11195 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_subtile) { 11196 TEST_REQUIRES_X86_AVX2; 11197 for (uint32_t n = 1; n <= 8; n++) { 11198 for (uint32_t m = 1; m <= 7; m++) { 11199 GemmMicrokernelTester() 11200 .mr(7) 11201 .nr(8) 11202 .kr(1) 11203 .sr(1) 11204 .m(m) 11205 .n(n) 11206 .k(1) 11207 .iterations(1) 11208 .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11209 } 11210 } 11211 } 11212 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_subtile_m)11213 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_subtile_m) { 11214 TEST_REQUIRES_X86_AVX2; 11215 for (uint32_t m = 1; m <= 7; m++) { 11216 GemmMicrokernelTester() 11217 .mr(7) 11218 .nr(8) 11219 .kr(1) 11220 .sr(1) 11221 .m(m) 11222 .n(8) 11223 .k(1) 11224 .iterations(1) 11225 .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11226 } 11227 } 11228 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,k_eq_1_subtile_n)11229 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, k_eq_1_subtile_n) { 11230 TEST_REQUIRES_X86_AVX2; 11231 for (uint32_t n = 1; n <= 8; n++) { 11232 GemmMicrokernelTester() 11233 .mr(7) 11234 .nr(8) 11235 .kr(1) 11236 .sr(1) 11237 .m(7) 11238 .n(n) 11239 .k(1) 11240 .iterations(1) 11241 .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11242 } 11243 } 11244 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,k_gt_1)11245 
// m = 7, n = 8; k swept over [2, 10).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, k_gt_1) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    GemmMicrokernelTester()
        .mr(7).nr(8).kr(1).sr(1)
        .m(7).n(8).k(cur_k)
        .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k in [2, 10), n in [1, 8], m in [1, 7]; iterations(1).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, k_gt_1_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 2; cur_k < 10; ++cur_k) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 7; ++cur_m) {
        GemmMicrokernelTester()
            .mr(7).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// m = 7; n in [9, 16); k in {1, 3, 5}.
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(cur_n).k(cur_k)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// m = 7; n in [9, 16); k in {1, 3, 5}; cn_stride(11).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(cur_n).k(cur_k)
          .cn_stride(11)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n in [9, 16); k in {1, 3, 5}; m in [1, 7]; iterations(1).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 7; ++cur_m) {
        GemmMicrokernelTester()
            .mr(7).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// m = 7; n in {16, 24}; k in {1, 3, 5}.
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(cur_n).k(cur_k)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// m = 7; n in {16, 24}; k in {1, 3, 5}; cn_stride(11).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8_strided_cn) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(cur_n).k(cur_k)
          .cn_stride(11)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// n in {16, 24}; k in {1, 3, 5}; m in [1, 7]; iterations(1).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      for (uint32_t cur_m = 1; cur_m <= 7; ++cur_m) {
        GemmMicrokernelTester()
            .mr(7).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}
// m = 7, n = 8; k in {1, 3, 5}; ks(3).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    GemmMicrokernelTester()
        .mr(7).nr(8).kr(1).sr(1)
        .m(7).n(8).k(cur_k)
        .ks(3)
        .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// k in {1, 3, 5}; n in [1, 8]; m in [1, 7]; ks(3); iterations(1).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, small_kernel_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 7; ++cur_m) {
        GemmMicrokernelTester()
            .mr(7).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .ks(3)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// m = 7; n in [9, 16); k in {1, 3, 5}; ks(3).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_gt_8_small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 9; cur_n < 16; ++cur_n) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(cur_n).k(cur_k)
          .ks(3)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// m = 7; n in {16, 24}; k in {1, 3, 5}; ks(3).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, n_div_8_small_kernel) {
  TEST_REQUIRES_X86_AVX2;
  for (uint32_t cur_n = 16; cur_n <= 24; cur_n += 8) {
    for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(cur_n).k(cur_k)
          .ks(3)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// k in {1, 3, 5}; n in [1, 8]; m in [1, 7]; cm_stride(11); iterations(1).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, strided_cm_subtile) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    for (uint32_t cur_n = 1; cur_n <= 8; ++cur_n) {
      for (uint32_t cur_m = 1; cur_m <= 7; ++cur_m) {
        GemmMicrokernelTester()
            .mr(7).nr(8).kr(1).sr(1)
            .m(cur_m).n(cur_n).k(cur_k)
            .cm_stride(11)
            .iterations(1)
            .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
      }
    }
  }
}

// m = 7, n = 8; k in {1, 3, 5}; ks(3); a_offset(37).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, a_offset) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    GemmMicrokernelTester()
        .mr(7).nr(8).kr(1).sr(1)
        .m(7).n(8).k(cur_k)
        .ks(3)
        .a_offset(37)
        .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
  }
}

// m = 7, n = 8; k in {1, 3, 5}; ks(3); a_offset(37); zero_index swept over [0, 7).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, zero) {
  TEST_REQUIRES_X86_AVX2;
  for (size_t cur_k = 1; cur_k <= 5; cur_k += 2) {
    for (uint32_t zero_idx = 0; zero_idx < 7; ++zero_idx) {
      GemmMicrokernelTester()
          .mr(7).nr(8).kr(1).sr(1)
          .m(7).n(8).k(cur_k)
          .ks(3)
          .a_offset(37)
          .zero_index(zero_idx)
          .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
    }
  }
}

// m = 7, n = 8, k = 1; qmin(128).
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, qmin) {
  TEST_REQUIRES_X86_AVX2;
  GemmMicrokernelTester()
      .mr(7).nr(8).kr(1).sr(1)
      .m(7).n(8).k(1)
      .qmin(128)
      .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params);
}
TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,qmax)11535 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, qmax) { 11536 TEST_REQUIRES_X86_AVX2; 11537 GemmMicrokernelTester() 11538 .mr(7) 11539 .nr(8) 11540 .kr(1) 11541 .sr(1) 11542 .m(7) 11543 .n(8) 11544 .k(1) 11545 .qmax(128) 11546 .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11547 } 11548 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST,strided_cm)11549 TEST(F16_IGEMM_MINMAX_7X8__AVX2_BROADCAST, strided_cm) { 11550 TEST_REQUIRES_X86_AVX2; 11551 GemmMicrokernelTester() 11552 .mr(7) 11553 .nr(8) 11554 .kr(1) 11555 .sr(1) 11556 .m(7) 11557 .n(8) 11558 .k(1) 11559 .cm_stride(11) 11560 .Test(xnn_f16_igemm_minmax_ukernel_7x8__avx2_broadcast, xnn_init_f16_minmax_avx_params); 11561 } 11562 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 11563