1 // Copyright (c) Facebook, Inc. and its affiliates. 2 // All rights reserved. 3 // 4 // Copyright 2019 Google LLC 5 // 6 // This source code is licensed under the BSD-style license found in the 7 // LICENSE file in the root directory of this source tree. 8 // 9 // Auto-generated file. Do not edit! 10 // Specification: test/qu8-gemm-minmax-rndnu.yaml 11 // Generator: tools/generate-gemm-test.py 12 13 14 #include <gtest/gtest.h> 15 16 #include <xnnpack/allocator.h> 17 #include <xnnpack/common.h> 18 #include <xnnpack/isa-checks.h> 19 20 #include <xnnpack/gemm.h> 21 #include <xnnpack/igemm.h> 22 #include <xnnpack/ppmm.h> 23 #include "gemm-microkernel-tester.h" 24 25 26 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_eq_8)27 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) { 28 TEST_REQUIRES_ARM_NEON; 29 GemmMicrokernelTester() 30 .mr(4) 31 .nr(8) 32 .kr(1) 33 .sr(1) 34 .m(4) 35 .n(8) 36 .k(8) 37 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 38 } 39 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,strided_cn)40 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cn) { 41 TEST_REQUIRES_ARM_NEON; 42 GemmMicrokernelTester() 43 .mr(4) 44 .nr(8) 45 .kr(1) 46 .sr(1) 47 .m(4) 48 .n(8) 49 .k(8) 50 .cn_stride(11) 51 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 52 } 53 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_strided_a)54 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) { 55 TEST_REQUIRES_ARM_NEON; 56 GemmMicrokernelTester() 57 .mr(4) 58 .nr(8) 59 .kr(1) 60 .sr(1) 61 .m(4) 62 .n(8) 63 .k(8) 64 .a_stride(11) 65 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 66 } 67 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_subtile)68 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) { 69 TEST_REQUIRES_ARM_NEON; 70 for (uint32_t n = 1; n <= 8; n++) { 71 for (uint32_t m = 1; m <= 4; m++) { 72 GemmMicrokernelTester() 73 .mr(4) 74 .nr(8) 75 .kr(1) 76 .sr(1) 77 .m(m) 78 .n(n) 79 .k(8) 80 .iterations(1) 81 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 82 } 83 } 84 } 85 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_subtile_m)86 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) { 87 TEST_REQUIRES_ARM_NEON; 88 for (uint32_t m = 1; m <= 4; m++) { 89 GemmMicrokernelTester() 90 .mr(4) 91 .nr(8) 92 .kr(1) 93 .sr(1) 94 .m(m) 95 .n(8) 96 .k(8) 97 .iterations(1) 98 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 99 } 100 } 101 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_subtile_n)102 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) { 103 TEST_REQUIRES_ARM_NEON; 104 for (uint32_t n = 1; n <= 8; n++) { 105 GemmMicrokernelTester() 106 .mr(4) 107 .nr(8) 108 .kr(1) 109 .sr(1) 110 .m(4) 111 .n(n) 112 .k(8) 113 .iterations(1) 114 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 115 } 116 } 117 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_lt_8)118 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) { 119 TEST_REQUIRES_ARM_NEON; 120 for (size_t k = 1; k < 8; k++) { 121 GemmMicrokernelTester() 122 .mr(4) 123 .nr(8) 124 .kr(1) 125 .sr(1) 126 .m(4) 127 .n(8) 128 
.k(k) 129 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 130 } 131 } 132 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_lt_8_strided_a)133 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) { 134 TEST_REQUIRES_ARM_NEON; 135 for (size_t k = 1; k < 8; k++) { 136 GemmMicrokernelTester() 137 .mr(4) 138 .nr(8) 139 .kr(1) 140 .sr(1) 141 .m(4) 142 .n(8) 143 .k(k) 144 .a_stride(11) 145 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 146 } 147 } 148 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_lt_8_subtile)149 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) { 150 TEST_REQUIRES_ARM_NEON; 151 for (size_t k = 1; k < 8; k++) { 152 for (uint32_t n = 1; n <= 8; n++) { 153 for (uint32_t m = 1; m <= 4; m++) { 154 GemmMicrokernelTester() 155 .mr(4) 156 .nr(8) 157 .kr(1) 158 .sr(1) 159 .m(m) 160 .n(n) 161 .k(k) 162 .iterations(1) 163 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 164 } 165 } 166 } 167 } 168 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_gt_8)169 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) { 170 TEST_REQUIRES_ARM_NEON; 171 for (size_t k = 9; k < 16; k++) { 172 GemmMicrokernelTester() 173 .mr(4) 174 .nr(8) 175 .kr(1) 176 .sr(1) 177 .m(4) 178 .n(8) 179 .k(k) 180 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 181 } 182 } 183 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_gt_8_strided_a)184 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) { 185 
TEST_REQUIRES_ARM_NEON; 186 for (size_t k = 9; k < 16; k++) { 187 GemmMicrokernelTester() 188 .mr(4) 189 .nr(8) 190 .kr(1) 191 .sr(1) 192 .m(4) 193 .n(8) 194 .k(k) 195 .a_stride(19) 196 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 197 } 198 } 199 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_gt_8_subtile)200 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) { 201 TEST_REQUIRES_ARM_NEON; 202 for (size_t k = 9; k < 16; k++) { 203 for (uint32_t n = 1; n <= 8; n++) { 204 for (uint32_t m = 1; m <= 4; m++) { 205 GemmMicrokernelTester() 206 .mr(4) 207 .nr(8) 208 .kr(1) 209 .sr(1) 210 .m(m) 211 .n(n) 212 .k(k) 213 .iterations(1) 214 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 215 } 216 } 217 } 218 } 219 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_div_8)220 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8) { 221 TEST_REQUIRES_ARM_NEON; 222 for (size_t k = 16; k <= 80; k += 8) { 223 GemmMicrokernelTester() 224 .mr(4) 225 .nr(8) 226 .kr(1) 227 .sr(1) 228 .m(4) 229 .n(8) 230 .k(k) 231 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 232 } 233 } 234 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_div_8_strided_a)235 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) { 236 TEST_REQUIRES_ARM_NEON; 237 for (size_t k = 16; k <= 80; k += 8) { 238 GemmMicrokernelTester() 239 .mr(4) 240 .nr(8) 241 .kr(1) 242 .sr(1) 243 .m(4) 244 .n(8) 245 .k(k) 246 .a_stride(83) 247 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 248 } 249 } 
250 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,k_div_8_subtile)251 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) { 252 TEST_REQUIRES_ARM_NEON; 253 for (size_t k = 16; k <= 80; k += 8) { 254 for (uint32_t n = 1; n <= 8; n++) { 255 for (uint32_t m = 1; m <= 4; m++) { 256 GemmMicrokernelTester() 257 .mr(4) 258 .nr(8) 259 .kr(1) 260 .sr(1) 261 .m(m) 262 .n(n) 263 .k(k) 264 .iterations(1) 265 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 266 } 267 } 268 } 269 } 270 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_gt_8)271 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) { 272 TEST_REQUIRES_ARM_NEON; 273 for (uint32_t n = 9; n < 16; n++) { 274 for (size_t k = 1; k <= 40; k += 9) { 275 GemmMicrokernelTester() 276 .mr(4) 277 .nr(8) 278 .kr(1) 279 .sr(1) 280 .m(4) 281 .n(n) 282 .k(k) 283 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 284 } 285 } 286 } 287 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_gt_8_strided_cn)288 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) { 289 TEST_REQUIRES_ARM_NEON; 290 for (uint32_t n = 9; n < 16; n++) { 291 for (size_t k = 1; k <= 40; k += 9) { 292 GemmMicrokernelTester() 293 .mr(4) 294 .nr(8) 295 .kr(1) 296 .sr(1) 297 .m(4) 298 .n(n) 299 .k(k) 300 .cn_stride(11) 301 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 302 } 303 } 304 } 305 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_gt_8_strided_a)306 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_a) { 307 TEST_REQUIRES_ARM_NEON; 308 for (uint32_t n = 9; n < 16; n++) { 309 for 
(size_t k = 1; k <= 40; k += 9) { 310 GemmMicrokernelTester() 311 .mr(4) 312 .nr(8) 313 .kr(1) 314 .sr(1) 315 .m(4) 316 .n(n) 317 .k(k) 318 .a_stride(43) 319 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 320 } 321 } 322 } 323 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_gt_8_subtile)324 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) { 325 TEST_REQUIRES_ARM_NEON; 326 for (uint32_t n = 9; n < 16; n++) { 327 for (size_t k = 1; k <= 40; k += 9) { 328 for (uint32_t m = 1; m <= 4; m++) { 329 GemmMicrokernelTester() 330 .mr(4) 331 .nr(8) 332 .kr(1) 333 .sr(1) 334 .m(m) 335 .n(n) 336 .k(k) 337 .iterations(1) 338 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 339 } 340 } 341 } 342 } 343 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_div_8)344 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8) { 345 TEST_REQUIRES_ARM_NEON; 346 for (uint32_t n = 16; n <= 24; n += 8) { 347 for (size_t k = 1; k <= 40; k += 9) { 348 GemmMicrokernelTester() 349 .mr(4) 350 .nr(8) 351 .kr(1) 352 .sr(1) 353 .m(4) 354 .n(n) 355 .k(k) 356 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 357 } 358 } 359 } 360 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_div_8_strided_cn)361 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) { 362 TEST_REQUIRES_ARM_NEON; 363 for (uint32_t n = 16; n <= 24; n += 8) { 364 for (size_t k = 1; k <= 40; k += 9) { 365 GemmMicrokernelTester() 366 .mr(4) 367 .nr(8) 368 .kr(1) 369 .sr(1) 370 .m(4) 371 .n(n) 372 .k(k) 373 .cn_stride(11) 374 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 375 } 376 } 377 } 378 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_div_8_strided_a)379 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_a) { 380 TEST_REQUIRES_ARM_NEON; 381 for (uint32_t n = 16; n <= 24; n += 8) { 382 for (size_t k = 1; k <= 40; k += 9) { 383 GemmMicrokernelTester() 384 .mr(4) 385 .nr(8) 386 .kr(1) 387 .sr(1) 388 .m(4) 389 .n(n) 390 .k(k) 391 .a_stride(43) 392 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 393 } 394 } 395 } 396 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,n_div_8_subtile)397 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) { 398 TEST_REQUIRES_ARM_NEON; 399 for (uint32_t n = 16; n <= 24; n += 8) { 400 for (size_t k = 1; k <= 40; k += 9) { 401 for (uint32_t m = 1; m <= 4; m++) { 402 GemmMicrokernelTester() 403 .mr(4) 404 .nr(8) 405 .kr(1) 406 .sr(1) 407 .m(m) 408 .n(n) 409 .k(k) 410 .iterations(1) 411 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 412 } 413 } 414 } 415 } 416 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,strided_cm_subtile)417 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) { 418 TEST_REQUIRES_ARM_NEON; 419 for (size_t k = 1; k <= 40; k += 9) { 420 for (uint32_t n = 1; n <= 8; n++) { 421 for (uint32_t m = 1; m <= 4; m++) { 422 GemmMicrokernelTester() 423 .mr(4) 424 .nr(8) 425 .kr(1) 426 .sr(1) 427 .m(m) 428 .n(n) 429 .k(k) 430 .cm_stride(11) 431 .iterations(1) 432 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 433 } 434 } 435 } 436 } 437 
// Full 4x8 tile, k = 8, qmin = 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 4x8 tile, k = 8, qmax = 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 4x8 tile, k = 8, cm_stride = 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(8).kr(1).sr(1)
    .m(4).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 4x8 tile, a_zero_point = 0, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(4).nr(8).kr(1).sr(1)
      .m(4).n(8).k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,no_b_zero_point)496 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, no_b_zero_point) { 497 TEST_REQUIRES_ARM_NEON; 498 for (size_t k = 1; k <= 40; k += 9) { 499 GemmMicrokernelTester() 500 .mr(4) 501 .nr(8) 502 .kr(1) 503 .sr(1) 504 .m(4) 505 .n(8) 506 .k(k) 507 .b_zero_point(0) 508 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 509 } 510 } 511 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64,no_zero_point)512 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_PRFM_LD64, no_zero_point) { 513 TEST_REQUIRES_ARM_NEON; 514 for (size_t k = 1; k <= 40; k += 9) { 515 GemmMicrokernelTester() 516 .mr(4) 517 .nr(8) 518 .kr(1) 519 .sr(1) 520 .m(4) 521 .n(8) 522 .k(k) 523 .a_zero_point(0) 524 .b_zero_point(0) 525 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 526 } 527 } 528 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 529 530 531 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_eq_8)532 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8) { 533 TEST_REQUIRES_ARM_NEON; 534 GemmMicrokernelTester() 535 .mr(4) 536 .nr(8) 537 .kr(1) 538 .sr(1) 539 .m(4) 540 .n(8) 541 .k(8) 542 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 543 } 544 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,strided_cn)545 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cn) { 546 TEST_REQUIRES_ARM_NEON; 547 GemmMicrokernelTester() 548 .mr(4) 549 .nr(8) 550 .kr(1) 551 .sr(1) 552 .m(4) 553 .n(8) 554 .k(8) 555 .cn_stride(11) 556 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 557 } 558 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_eq_8_strided_a)559 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) { 560 TEST_REQUIRES_ARM_NEON; 561 GemmMicrokernelTester() 562 .mr(4) 563 .nr(8) 564 .kr(1) 565 .sr(1) 566 .m(4) 567 .n(8) 568 .k(8) 569 .a_stride(11) 570 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 571 } 572 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_eq_8_subtile)573 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile) { 574 TEST_REQUIRES_ARM_NEON; 575 for (uint32_t n = 1; n <= 8; n++) { 576 for (uint32_t m = 1; m <= 4; m++) { 577 GemmMicrokernelTester() 578 .mr(4) 579 .nr(8) 580 .kr(1) 581 .sr(1) 582 .m(m) 583 .n(n) 584 .k(8) 585 .iterations(1) 586 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 587 } 588 } 589 } 590 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_eq_8_subtile_m)591 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) { 592 TEST_REQUIRES_ARM_NEON; 593 for (uint32_t m = 1; m <= 4; m++) { 594 GemmMicrokernelTester() 595 .mr(4) 596 .nr(8) 597 .kr(1) 598 .sr(1) 599 .m(m) 600 .n(8) 601 .k(8) 602 .iterations(1) 603 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 604 } 605 } 606 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_eq_8_subtile_n)607 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) { 608 TEST_REQUIRES_ARM_NEON; 609 for (uint32_t n = 1; n <= 8; n++) { 610 GemmMicrokernelTester() 611 .mr(4) 612 .nr(8) 613 .kr(1) 614 .sr(1) 615 .m(4) 616 .n(n) 617 .k(8) 618 .iterations(1) 619 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 620 } 621 } 622 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_lt_8)623 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8) { 624 TEST_REQUIRES_ARM_NEON; 625 for (size_t k = 1; k < 8; k++) { 626 GemmMicrokernelTester() 627 .mr(4) 628 .nr(8) 629 .kr(1) 630 .sr(1) 631 .m(4) 632 .n(8) 633 .k(k) 634 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 635 } 636 } 637 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_lt_8_strided_a)638 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) { 639 TEST_REQUIRES_ARM_NEON; 640 for (size_t k = 1; k < 8; k++) { 641 GemmMicrokernelTester() 642 .mr(4) 643 .nr(8) 644 .kr(1) 645 .sr(1) 646 .m(4) 647 .n(8) 648 .k(k) 649 .a_stride(11) 650 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 651 } 652 } 653 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_lt_8_subtile)654 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_lt_8_subtile) { 655 TEST_REQUIRES_ARM_NEON; 656 for (size_t k = 1; k < 8; k++) { 657 for (uint32_t n = 1; n <= 8; n++) { 658 for (uint32_t m = 1; m <= 4; m++) { 659 GemmMicrokernelTester() 660 .mr(4) 661 .nr(8) 662 .kr(1) 663 .sr(1) 664 .m(m) 665 .n(n) 666 .k(k) 667 .iterations(1) 668 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 669 } 670 } 671 } 672 } 673 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_gt_8)674 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8) { 675 TEST_REQUIRES_ARM_NEON; 676 for (size_t k = 9; k < 16; k++) { 677 GemmMicrokernelTester() 678 .mr(4) 679 
.nr(8) 680 .kr(1) 681 .sr(1) 682 .m(4) 683 .n(8) 684 .k(k) 685 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 686 } 687 } 688 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_gt_8_strided_a)689 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) { 690 TEST_REQUIRES_ARM_NEON; 691 for (size_t k = 9; k < 16; k++) { 692 GemmMicrokernelTester() 693 .mr(4) 694 .nr(8) 695 .kr(1) 696 .sr(1) 697 .m(4) 698 .n(8) 699 .k(k) 700 .a_stride(19) 701 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 702 } 703 } 704 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_gt_8_subtile)705 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_gt_8_subtile) { 706 TEST_REQUIRES_ARM_NEON; 707 for (size_t k = 9; k < 16; k++) { 708 for (uint32_t n = 1; n <= 8; n++) { 709 for (uint32_t m = 1; m <= 4; m++) { 710 GemmMicrokernelTester() 711 .mr(4) 712 .nr(8) 713 .kr(1) 714 .sr(1) 715 .m(m) 716 .n(n) 717 .k(k) 718 .iterations(1) 719 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 720 } 721 } 722 } 723 } 724 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_div_8)725 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8) { 726 TEST_REQUIRES_ARM_NEON; 727 for (size_t k = 16; k <= 80; k += 8) { 728 GemmMicrokernelTester() 729 .mr(4) 730 .nr(8) 731 .kr(1) 732 .sr(1) 733 .m(4) 734 .n(8) 735 .k(k) 736 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 737 } 738 } 739 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_div_8_strided_a)740 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8_strided_a) { 741 
TEST_REQUIRES_ARM_NEON; 742 for (size_t k = 16; k <= 80; k += 8) { 743 GemmMicrokernelTester() 744 .mr(4) 745 .nr(8) 746 .kr(1) 747 .sr(1) 748 .m(4) 749 .n(8) 750 .k(k) 751 .a_stride(83) 752 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 753 } 754 } 755 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,k_div_8_subtile)756 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, k_div_8_subtile) { 757 TEST_REQUIRES_ARM_NEON; 758 for (size_t k = 16; k <= 80; k += 8) { 759 for (uint32_t n = 1; n <= 8; n++) { 760 for (uint32_t m = 1; m <= 4; m++) { 761 GemmMicrokernelTester() 762 .mr(4) 763 .nr(8) 764 .kr(1) 765 .sr(1) 766 .m(m) 767 .n(n) 768 .k(k) 769 .iterations(1) 770 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 771 } 772 } 773 } 774 } 775 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_gt_8)776 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8) { 777 TEST_REQUIRES_ARM_NEON; 778 for (uint32_t n = 9; n < 16; n++) { 779 for (size_t k = 1; k <= 40; k += 9) { 780 GemmMicrokernelTester() 781 .mr(4) 782 .nr(8) 783 .kr(1) 784 .sr(1) 785 .m(4) 786 .n(n) 787 .k(k) 788 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 789 } 790 } 791 } 792 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_gt_8_strided_cn)793 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) { 794 TEST_REQUIRES_ARM_NEON; 795 for (uint32_t n = 9; n < 16; n++) { 796 for (size_t k = 1; k <= 40; k += 9) { 797 GemmMicrokernelTester() 798 .mr(4) 799 .nr(8) 800 .kr(1) 801 .sr(1) 802 .m(4) 803 .n(n) 804 .k(k) 805 .cn_stride(11) 806 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 807 } 808 } 809 } 810 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_gt_8_strided_a)811 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_strided_a) { 812 TEST_REQUIRES_ARM_NEON; 813 for (uint32_t n = 9; n < 16; n++) { 814 for (size_t k = 1; k <= 40; k += 9) { 815 GemmMicrokernelTester() 816 .mr(4) 817 .nr(8) 818 .kr(1) 819 .sr(1) 820 .m(4) 821 .n(n) 822 .k(k) 823 .a_stride(43) 824 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 825 } 826 } 827 } 828 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_gt_8_subtile)829 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_gt_8_subtile) { 830 TEST_REQUIRES_ARM_NEON; 831 for (uint32_t n = 9; n < 16; n++) { 832 for (size_t k = 1; k <= 40; k += 9) { 833 for (uint32_t m = 1; m <= 4; m++) { 834 GemmMicrokernelTester() 835 .mr(4) 836 .nr(8) 837 .kr(1) 838 .sr(1) 839 .m(m) 840 .n(n) 841 .k(k) 842 .iterations(1) 843 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 844 } 845 } 846 } 847 } 848 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_div_8)849 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8) { 850 TEST_REQUIRES_ARM_NEON; 851 for (uint32_t n = 16; n <= 24; n += 8) { 852 for (size_t k = 1; k <= 40; k += 9) { 853 GemmMicrokernelTester() 854 .mr(4) 855 .nr(8) 856 .kr(1) 857 .sr(1) 858 .m(4) 859 .n(n) 860 .k(k) 861 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 862 } 863 } 864 } 865 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_div_8_strided_cn)866 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) { 867 TEST_REQUIRES_ARM_NEON; 868 for 
(uint32_t n = 16; n <= 24; n += 8) { 869 for (size_t k = 1; k <= 40; k += 9) { 870 GemmMicrokernelTester() 871 .mr(4) 872 .nr(8) 873 .kr(1) 874 .sr(1) 875 .m(4) 876 .n(n) 877 .k(k) 878 .cn_stride(11) 879 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 880 } 881 } 882 } 883 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_div_8_strided_a)884 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_strided_a) { 885 TEST_REQUIRES_ARM_NEON; 886 for (uint32_t n = 16; n <= 24; n += 8) { 887 for (size_t k = 1; k <= 40; k += 9) { 888 GemmMicrokernelTester() 889 .mr(4) 890 .nr(8) 891 .kr(1) 892 .sr(1) 893 .m(4) 894 .n(n) 895 .k(k) 896 .a_stride(43) 897 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 898 } 899 } 900 } 901 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,n_div_8_subtile)902 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, n_div_8_subtile) { 903 TEST_REQUIRES_ARM_NEON; 904 for (uint32_t n = 16; n <= 24; n += 8) { 905 for (size_t k = 1; k <= 40; k += 9) { 906 for (uint32_t m = 1; m <= 4; m++) { 907 GemmMicrokernelTester() 908 .mr(4) 909 .nr(8) 910 .kr(1) 911 .sr(1) 912 .m(m) 913 .n(n) 914 .k(k) 915 .iterations(1) 916 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 917 } 918 } 919 } 920 } 921 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,strided_cm_subtile)922 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm_subtile) { 923 TEST_REQUIRES_ARM_NEON; 924 for (size_t k = 1; k <= 40; k += 9) { 925 for (uint32_t n = 1; n <= 8; n++) { 926 for (uint32_t m = 1; m <= 4; m++) { 927 GemmMicrokernelTester() 928 .mr(4) 929 .nr(8) 930 .kr(1) 931 .sr(1) 932 .m(m) 933 .n(n) 934 .k(k) 935 .cm_stride(11) 936 
.iterations(1) 937 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 938 } 939 } 940 } 941 } 942 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,qmin)943 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmin) { 944 TEST_REQUIRES_ARM_NEON; 945 GemmMicrokernelTester() 946 .mr(4) 947 .nr(8) 948 .kr(1) 949 .sr(1) 950 .m(4) 951 .n(8) 952 .k(8) 953 .qmin(128) 954 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 955 } 956 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,qmax)957 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, qmax) { 958 TEST_REQUIRES_ARM_NEON; 959 GemmMicrokernelTester() 960 .mr(4) 961 .nr(8) 962 .kr(1) 963 .sr(1) 964 .m(4) 965 .n(8) 966 .k(8) 967 .qmax(128) 968 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 969 } 970 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,strided_cm)971 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, strided_cm) { 972 TEST_REQUIRES_ARM_NEON; 973 GemmMicrokernelTester() 974 .mr(4) 975 .nr(8) 976 .kr(1) 977 .sr(1) 978 .m(4) 979 .n(8) 980 .k(8) 981 .cm_stride(11) 982 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 983 } 984 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,no_a_zero_point)985 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, no_a_zero_point) { 986 TEST_REQUIRES_ARM_NEON; 987 for (size_t k = 1; k <= 40; k += 9) { 988 GemmMicrokernelTester() 989 .mr(4) 990 .nr(8) 991 .kr(1) 992 .sr(1) 993 .m(4) 994 .n(8) 995 .k(k) 996 .a_zero_point(0) 997 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 998 } 999 } 1000 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,no_b_zero_point)1001 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, no_b_zero_point) { 1002 TEST_REQUIRES_ARM_NEON; 1003 for (size_t k = 1; k <= 40; k += 9) { 1004 GemmMicrokernelTester() 1005 .mr(4) 1006 .nr(8) 1007 .kr(1) 1008 .sr(1) 1009 .m(4) 1010 .n(8) 1011 .k(k) 1012 .b_zero_point(0) 1013 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1014 } 1015 } 1016 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64,no_zero_point)1017 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__AARCH32_NEON_MLAL_LANE_LD64, no_zero_point) { 1018 TEST_REQUIRES_ARM_NEON; 1019 for (size_t k = 1; k <= 40; k += 9) { 1020 GemmMicrokernelTester() 1021 .mr(4) 1022 .nr(8) 1023 .kr(1) 1024 .sr(1) 1025 .m(4) 1026 .n(8) 1027 .k(k) 1028 .a_zero_point(0) 1029 .b_zero_point(0) 1030 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1031 } 1032 } 1033 #endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY 1034 1035 1036 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_eq_8)1037 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8) { 1038 TEST_REQUIRES_ARM_NEON; 1039 GemmMicrokernelTester() 1040 .mr(1) 1041 .nr(8) 1042 .kr(1) 1043 .sr(1) 1044 .m(1) 1045 .n(8) 1046 .k(8) 1047 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1048 } 1049 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,strided_cn)1050 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, strided_cn) { 1051 TEST_REQUIRES_ARM_NEON; 1052 GemmMicrokernelTester() 1053 .mr(1) 1054 .nr(8) 1055 .kr(1) 1056 .sr(1) 1057 .m(1) 1058 .n(8) 1059 .k(8) 1060 .cn_stride(11) 1061 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1062 } 1063 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_eq_8_strided_a)1064 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8_strided_a) { 1065 TEST_REQUIRES_ARM_NEON; 1066 GemmMicrokernelTester() 1067 .mr(1) 1068 .nr(8) 1069 .kr(1) 1070 .sr(1) 1071 .m(1) 1072 .n(8) 1073 .k(8) 1074 .a_stride(11) 1075 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1076 } 1077 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_eq_8_subtile)1078 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8_subtile) { 1079 TEST_REQUIRES_ARM_NEON; 1080 for (uint32_t n = 1; n <= 8; n++) { 1081 for (uint32_t m = 1; m <= 1; m++) { 1082 GemmMicrokernelTester() 1083 .mr(1) 1084 .nr(8) 1085 .kr(1) 1086 .sr(1) 1087 .m(m) 1088 .n(n) 1089 .k(8) 1090 .iterations(1) 1091 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1092 } 1093 } 1094 } 1095 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_eq_8_subtile_m)1096 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8_subtile_m) { 1097 TEST_REQUIRES_ARM_NEON; 1098 for (uint32_t m = 1; m <= 1; m++) { 1099 GemmMicrokernelTester() 1100 .mr(1) 1101 .nr(8) 1102 .kr(1) 1103 .sr(1) 1104 .m(m) 1105 .n(8) 1106 .k(8) 1107 .iterations(1) 1108 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1109 } 1110 } 1111 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_eq_8_subtile_n)1112 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_eq_8_subtile_n) { 1113 TEST_REQUIRES_ARM_NEON; 1114 for (uint32_t n = 1; n <= 8; n++) { 1115 GemmMicrokernelTester() 1116 .mr(1) 1117 .nr(8) 1118 .kr(1) 1119 .sr(1) 1120 .m(1) 1121 .n(n) 1122 .k(8) 1123 .iterations(1) 1124 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1125 } 1126 } 1127 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_lt_8)1128 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_lt_8) { 1129 TEST_REQUIRES_ARM_NEON; 1130 for (size_t k = 1; k < 8; k++) { 1131 GemmMicrokernelTester() 1132 .mr(1) 1133 .nr(8) 1134 .kr(1) 1135 .sr(1) 1136 .m(1) 1137 .n(8) 1138 .k(k) 1139 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1140 } 1141 } 1142 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_lt_8_strided_a)1143 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_lt_8_strided_a) { 1144 TEST_REQUIRES_ARM_NEON; 1145 for (size_t k = 1; k < 8; k++) { 1146 GemmMicrokernelTester() 1147 .mr(1) 1148 .nr(8) 1149 .kr(1) 1150 .sr(1) 1151 .m(1) 1152 .n(8) 1153 .k(k) 1154 .a_stride(11) 1155 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1156 } 1157 } 1158 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_lt_8_subtile)1159 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_lt_8_subtile) { 1160 TEST_REQUIRES_ARM_NEON; 1161 for (size_t k = 1; k < 8; k++) { 1162 for (uint32_t n = 1; n <= 8; n++) { 1163 for (uint32_t m = 1; m <= 1; m++) { 1164 GemmMicrokernelTester() 1165 .mr(1) 1166 .nr(8) 1167 .kr(1) 1168 .sr(1) 1169 .m(m) 1170 .n(n) 1171 .k(k) 1172 .iterations(1) 1173 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1174 } 1175 } 1176 } 1177 } 1178 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_gt_8)1179 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_gt_8) { 1180 TEST_REQUIRES_ARM_NEON; 1181 for (size_t k = 9; k < 16; k++) { 1182 GemmMicrokernelTester() 1183 .mr(1) 1184 .nr(8) 1185 .kr(1) 1186 .sr(1) 1187 .m(1) 1188 .n(8) 1189 .k(k) 1190 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1191 } 1192 } 1193 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_gt_8_strided_a)1194 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_gt_8_strided_a) { 1195 TEST_REQUIRES_ARM_NEON; 1196 for (size_t k = 9; k < 16; k++) { 1197 GemmMicrokernelTester() 1198 .mr(1) 1199 .nr(8) 1200 .kr(1) 1201 .sr(1) 1202 .m(1) 1203 .n(8) 1204 .k(k) 1205 .a_stride(19) 1206 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1207 } 1208 } 1209 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_gt_8_subtile)1210 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_gt_8_subtile) { 1211 TEST_REQUIRES_ARM_NEON; 1212 for (size_t k = 9; k < 16; k++) { 1213 for (uint32_t n = 1; n <= 8; n++) { 1214 for (uint32_t m = 1; m <= 1; m++) { 1215 GemmMicrokernelTester() 1216 .mr(1) 1217 .nr(8) 1218 .kr(1) 1219 .sr(1) 1220 .m(m) 1221 .n(n) 1222 .k(k) 1223 .iterations(1) 1224 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1225 } 1226 } 1227 } 1228 } 1229 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_div_8)1230 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_div_8) { 1231 TEST_REQUIRES_ARM_NEON; 1232 for (size_t k = 16; k <= 80; k += 8) { 1233 GemmMicrokernelTester() 1234 .mr(1) 1235 .nr(8) 1236 .kr(1) 1237 .sr(1) 1238 .m(1) 1239 .n(8) 1240 .k(k) 1241 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1242 } 1243 } 1244 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_div_8_strided_a)1245 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_div_8_strided_a) { 1246 TEST_REQUIRES_ARM_NEON; 1247 for (size_t k = 16; k <= 80; k += 8) { 1248 GemmMicrokernelTester() 1249 .mr(1) 1250 .nr(8) 1251 .kr(1) 1252 .sr(1) 1253 .m(1) 1254 .n(8) 1255 
.k(k) 1256 .a_stride(83) 1257 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1258 } 1259 } 1260 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,k_div_8_subtile)1261 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, k_div_8_subtile) { 1262 TEST_REQUIRES_ARM_NEON; 1263 for (size_t k = 16; k <= 80; k += 8) { 1264 for (uint32_t n = 1; n <= 8; n++) { 1265 for (uint32_t m = 1; m <= 1; m++) { 1266 GemmMicrokernelTester() 1267 .mr(1) 1268 .nr(8) 1269 .kr(1) 1270 .sr(1) 1271 .m(m) 1272 .n(n) 1273 .k(k) 1274 .iterations(1) 1275 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1276 } 1277 } 1278 } 1279 } 1280 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_gt_8)1281 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_gt_8) { 1282 TEST_REQUIRES_ARM_NEON; 1283 for (uint32_t n = 9; n < 16; n++) { 1284 for (size_t k = 1; k <= 40; k += 9) { 1285 GemmMicrokernelTester() 1286 .mr(1) 1287 .nr(8) 1288 .kr(1) 1289 .sr(1) 1290 .m(1) 1291 .n(n) 1292 .k(k) 1293 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1294 } 1295 } 1296 } 1297 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_gt_8_strided_cn)1298 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_gt_8_strided_cn) { 1299 TEST_REQUIRES_ARM_NEON; 1300 for (uint32_t n = 9; n < 16; n++) { 1301 for (size_t k = 1; k <= 40; k += 9) { 1302 GemmMicrokernelTester() 1303 .mr(1) 1304 .nr(8) 1305 .kr(1) 1306 .sr(1) 1307 .m(1) 1308 .n(n) 1309 .k(k) 1310 .cn_stride(11) 1311 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1312 } 1313 } 1314 } 1315 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_gt_8_strided_a)1316 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_gt_8_strided_a) { 1317 TEST_REQUIRES_ARM_NEON; 1318 for 
(uint32_t n = 9; n < 16; n++) { 1319 for (size_t k = 1; k <= 40; k += 9) { 1320 GemmMicrokernelTester() 1321 .mr(1) 1322 .nr(8) 1323 .kr(1) 1324 .sr(1) 1325 .m(1) 1326 .n(n) 1327 .k(k) 1328 .a_stride(43) 1329 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1330 } 1331 } 1332 } 1333 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_gt_8_subtile)1334 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_gt_8_subtile) { 1335 TEST_REQUIRES_ARM_NEON; 1336 for (uint32_t n = 9; n < 16; n++) { 1337 for (size_t k = 1; k <= 40; k += 9) { 1338 for (uint32_t m = 1; m <= 1; m++) { 1339 GemmMicrokernelTester() 1340 .mr(1) 1341 .nr(8) 1342 .kr(1) 1343 .sr(1) 1344 .m(m) 1345 .n(n) 1346 .k(k) 1347 .iterations(1) 1348 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1349 } 1350 } 1351 } 1352 } 1353 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_div_8)1354 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_div_8) { 1355 TEST_REQUIRES_ARM_NEON; 1356 for (uint32_t n = 16; n <= 24; n += 8) { 1357 for (size_t k = 1; k <= 40; k += 9) { 1358 GemmMicrokernelTester() 1359 .mr(1) 1360 .nr(8) 1361 .kr(1) 1362 .sr(1) 1363 .m(1) 1364 .n(n) 1365 .k(k) 1366 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1367 } 1368 } 1369 } 1370 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_div_8_strided_cn)1371 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_div_8_strided_cn) { 1372 TEST_REQUIRES_ARM_NEON; 1373 for (uint32_t n = 16; n <= 24; n += 8) { 1374 for (size_t k = 1; k <= 40; k += 9) { 1375 GemmMicrokernelTester() 1376 .mr(1) 1377 .nr(8) 1378 .kr(1) 1379 .sr(1) 1380 .m(1) 1381 .n(n) 1382 .k(k) 1383 .cn_stride(11) 1384 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1385 } 
1386 } 1387 } 1388 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_div_8_strided_a)1389 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_div_8_strided_a) { 1390 TEST_REQUIRES_ARM_NEON; 1391 for (uint32_t n = 16; n <= 24; n += 8) { 1392 for (size_t k = 1; k <= 40; k += 9) { 1393 GemmMicrokernelTester() 1394 .mr(1) 1395 .nr(8) 1396 .kr(1) 1397 .sr(1) 1398 .m(1) 1399 .n(n) 1400 .k(k) 1401 .a_stride(43) 1402 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1403 } 1404 } 1405 } 1406 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,n_div_8_subtile)1407 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, n_div_8_subtile) { 1408 TEST_REQUIRES_ARM_NEON; 1409 for (uint32_t n = 16; n <= 24; n += 8) { 1410 for (size_t k = 1; k <= 40; k += 9) { 1411 for (uint32_t m = 1; m <= 1; m++) { 1412 GemmMicrokernelTester() 1413 .mr(1) 1414 .nr(8) 1415 .kr(1) 1416 .sr(1) 1417 .m(m) 1418 .n(n) 1419 .k(k) 1420 .iterations(1) 1421 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1422 } 1423 } 1424 } 1425 } 1426 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,strided_cm_subtile)1427 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, strided_cm_subtile) { 1428 TEST_REQUIRES_ARM_NEON; 1429 for (size_t k = 1; k <= 40; k += 9) { 1430 for (uint32_t n = 1; n <= 8; n++) { 1431 for (uint32_t m = 1; m <= 1; m++) { 1432 GemmMicrokernelTester() 1433 .mr(1) 1434 .nr(8) 1435 .kr(1) 1436 .sr(1) 1437 .m(m) 1438 .n(n) 1439 .k(k) 1440 .cm_stride(11) 1441 .iterations(1) 1442 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1443 } 1444 } 1445 } 1446 } 1447 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,qmin)1448 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, qmin) { 1449 TEST_REQUIRES_ARM_NEON; 1450 GemmMicrokernelTester() 1451 .mr(1) 1452 .nr(8) 1453 .kr(1) 1454 .sr(1) 
1455 .m(1) 1456 .n(8) 1457 .k(8) 1458 .qmin(128) 1459 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1460 } 1461 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,qmax)1462 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, qmax) { 1463 TEST_REQUIRES_ARM_NEON; 1464 GemmMicrokernelTester() 1465 .mr(1) 1466 .nr(8) 1467 .kr(1) 1468 .sr(1) 1469 .m(1) 1470 .n(8) 1471 .k(8) 1472 .qmax(128) 1473 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1474 } 1475 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,strided_cm)1476 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, strided_cm) { 1477 TEST_REQUIRES_ARM_NEON; 1478 GemmMicrokernelTester() 1479 .mr(1) 1480 .nr(8) 1481 .kr(1) 1482 .sr(1) 1483 .m(1) 1484 .n(8) 1485 .k(8) 1486 .cm_stride(11) 1487 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1488 } 1489 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,no_a_zero_point)1490 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, no_a_zero_point) { 1491 TEST_REQUIRES_ARM_NEON; 1492 for (size_t k = 1; k <= 40; k += 9) { 1493 GemmMicrokernelTester() 1494 .mr(1) 1495 .nr(8) 1496 .kr(1) 1497 .sr(1) 1498 .m(1) 1499 .n(8) 1500 .k(k) 1501 .a_zero_point(0) 1502 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1503 } 1504 } 1505 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,no_b_zero_point)1506 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, no_b_zero_point) { 1507 TEST_REQUIRES_ARM_NEON; 1508 for (size_t k = 1; k <= 40; k += 9) { 1509 GemmMicrokernelTester() 1510 .mr(1) 1511 .nr(8) 1512 .kr(1) 1513 .sr(1) 1514 .m(1) 1515 .n(8) 1516 .k(k) 1517 .b_zero_point(0) 1518 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 1519 } 1520 } 1521 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE,no_zero_point)1522 TEST(QU8_GEMM_MINMAX_RNDNU_1X8__NEON_MLAL_LANE, no_zero_point) { 1523 TEST_REQUIRES_ARM_NEON; 1524 for (size_t k = 1; k <= 40; k += 9) { 1525 GemmMicrokernelTester() 1526 .mr(1) 1527 .nr(8) 1528 .kr(1) 1529 .sr(1) 1530 .m(1) 1531 .n(8) 1532 .k(k) 1533 .a_zero_point(0) 1534 .b_zero_point(0) 1535 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1536 } 1537 } 1538 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1539 1540 1541 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,k_eq_8)1542 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8) { 1543 TEST_REQUIRES_ARM_NEON; 1544 GemmMicrokernelTester() 1545 .mr(4) 1546 .nr(8) 1547 .kr(1) 1548 .sr(1) 1549 .m(4) 1550 .n(8) 1551 .k(8) 1552 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1553 } 1554 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,strided_cn)1555 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, strided_cn) { 1556 TEST_REQUIRES_ARM_NEON; 1557 GemmMicrokernelTester() 1558 .mr(4) 1559 .nr(8) 1560 .kr(1) 1561 .sr(1) 1562 .m(4) 1563 .n(8) 1564 .k(8) 1565 .cn_stride(11) 1566 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1567 } 1568 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,k_eq_8_strided_a)1569 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_strided_a) { 1570 TEST_REQUIRES_ARM_NEON; 1571 GemmMicrokernelTester() 1572 .mr(4) 1573 .nr(8) 1574 .kr(1) 1575 .sr(1) 1576 .m(4) 1577 .n(8) 1578 .k(8) 1579 .a_stride(11) 1580 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1581 } 1582 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,k_eq_8_subtile)1583 
// k = 8 with every sub-tile shape of the mr=4 x nr=8 tile: n in [1, 8] and
// m in [1, 4], one tester iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    for (uint32_t m = 1; m <= 4; m++) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(m)
        .n(n)
        .k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// k = 8, n fixed at 8, m swept over [1, 4]; one tester iteration per m.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(m)
      .n(8)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k = 8, m fixed at 4, n swept over [1, 8]; one tester iteration per n.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 8; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile with k swept over [1, 7].
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile with k swept over [1, 7] and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in [1, 7] crossed with every sub-tile shape (n in [1, 8], m in [1, 4]);
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x8 tile with k swept over [9, 15].
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile with k swept over [9, 15] and a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in [9, 15] crossed with every sub-tile shape (n in [1, 8], m in [1, 4]);
// one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x8 tile with k = 16, 24, ..., 80 (multiples of 8).
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x8 tile with k = 16, 24, ..., 80 and a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(8)
      .kr(1)
      .sr(1)
      .m(4)
      .n(8)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k = 16, 24, ..., 80 crossed with every sub-tile shape (n in [1, 8],
// m in [1, 4]); one tester iteration per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n in [9, 15] (larger than nr = 8) with m = 4 and k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_8, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_8, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in [9, 15], k = 1, 10, ..., 37, and m swept over [1, 4]; one tester
// iteration per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 9; n < 16; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(8)
          .kr(1)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n = 16 and 24 (multiples of nr = 8) with m = 4 and k = 1, 10, 19, 28, 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_div_8, with cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_div_8, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4)
        .nr(8)
        .kr(1)
        .sr(1)
        .m(4)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,n_div_8_subtile)1912 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, n_div_8_subtile) { 1913 TEST_REQUIRES_ARM_NEON; 1914 for (uint32_t n = 16; n <= 24; n += 8) { 1915 for (size_t k = 1; k <= 40; k += 9) { 1916 for (uint32_t m = 1; m <= 4; m++) { 1917 GemmMicrokernelTester() 1918 .mr(4) 1919 .nr(8) 1920 .kr(1) 1921 .sr(1) 1922 .m(m) 1923 .n(n) 1924 .k(k) 1925 .iterations(1) 1926 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1927 } 1928 } 1929 } 1930 } 1931 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,strided_cm_subtile)1932 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, strided_cm_subtile) { 1933 TEST_REQUIRES_ARM_NEON; 1934 for (size_t k = 1; k <= 40; k += 9) { 1935 for (uint32_t n = 1; n <= 8; n++) { 1936 for (uint32_t m = 1; m <= 4; m++) { 1937 GemmMicrokernelTester() 1938 .mr(4) 1939 .nr(8) 1940 .kr(1) 1941 .sr(1) 1942 .m(m) 1943 .n(n) 1944 .k(k) 1945 .cm_stride(11) 1946 .iterations(1) 1947 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1948 } 1949 } 1950 } 1951 } 1952 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,qmin)1953 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, qmin) { 1954 TEST_REQUIRES_ARM_NEON; 1955 GemmMicrokernelTester() 1956 .mr(4) 1957 .nr(8) 1958 .kr(1) 1959 .sr(1) 1960 .m(4) 1961 .n(8) 1962 .k(8) 1963 .qmin(128) 1964 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1965 } 1966 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,qmax)1967 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, qmax) { 1968 TEST_REQUIRES_ARM_NEON; 1969 GemmMicrokernelTester() 1970 .mr(4) 1971 .nr(8) 1972 .kr(1) 1973 .sr(1) 1974 .m(4) 1975 .n(8) 1976 .k(8) 1977 .qmax(128) 1978 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1979 } 1980 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,strided_cm)1981 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, strided_cm) { 1982 TEST_REQUIRES_ARM_NEON; 1983 GemmMicrokernelTester() 1984 .mr(4) 1985 .nr(8) 1986 .kr(1) 1987 .sr(1) 1988 .m(4) 1989 .n(8) 1990 .k(8) 1991 .cm_stride(11) 1992 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 1993 } 1994 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,no_a_zero_point)1995 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, no_a_zero_point) { 1996 TEST_REQUIRES_ARM_NEON; 1997 for (size_t k = 1; k <= 40; k += 9) { 1998 GemmMicrokernelTester() 1999 .mr(4) 2000 .nr(8) 2001 .kr(1) 2002 .sr(1) 2003 .m(4) 2004 .n(8) 2005 .k(k) 2006 .a_zero_point(0) 2007 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2008 } 2009 } 2010 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,no_b_zero_point)2011 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, no_b_zero_point) { 2012 TEST_REQUIRES_ARM_NEON; 2013 for (size_t k = 1; k <= 40; k += 9) { 2014 GemmMicrokernelTester() 2015 .mr(4) 2016 .nr(8) 2017 .kr(1) 2018 .sr(1) 2019 .m(4) 2020 .n(8) 2021 .k(k) 2022 .b_zero_point(0) 2023 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2024 } 2025 } 2026 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE,no_zero_point)2027 TEST(QU8_GEMM_MINMAX_RNDNU_4X8__NEON_MLAL_LANE, no_zero_point) { 2028 TEST_REQUIRES_ARM_NEON; 2029 for (size_t k = 1; k <= 40; k += 9) { 2030 GemmMicrokernelTester() 2031 .mr(4) 2032 .nr(8) 2033 .kr(1) 2034 .sr(1) 2035 .m(4) 2036 .n(8) 2037 .k(k) 2038 .a_zero_point(0) 2039 .b_zero_point(0) 2040 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 
2041 } 2042 } 2043 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2044 2045 2046 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_eq_8)2047 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_eq_8) { 2048 TEST_REQUIRES_ARM_NEON; 2049 GemmMicrokernelTester() 2050 .mr(4) 2051 .nr(16) 2052 .kr(1) 2053 .sr(1) 2054 .m(4) 2055 .n(16) 2056 .k(8) 2057 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2058 } 2059 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,strided_cn)2060 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, strided_cn) { 2061 TEST_REQUIRES_ARM_NEON; 2062 GemmMicrokernelTester() 2063 .mr(4) 2064 .nr(16) 2065 .kr(1) 2066 .sr(1) 2067 .m(4) 2068 .n(16) 2069 .k(8) 2070 .cn_stride(19) 2071 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2072 } 2073 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_eq_8_strided_a)2074 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_eq_8_strided_a) { 2075 TEST_REQUIRES_ARM_NEON; 2076 GemmMicrokernelTester() 2077 .mr(4) 2078 .nr(16) 2079 .kr(1) 2080 .sr(1) 2081 .m(4) 2082 .n(16) 2083 .k(8) 2084 .a_stride(11) 2085 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2086 } 2087 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_eq_8_subtile)2088 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_eq_8_subtile) { 2089 TEST_REQUIRES_ARM_NEON; 2090 for (uint32_t n = 1; n <= 16; n++) { 2091 for (uint32_t m = 1; m <= 4; m++) { 2092 GemmMicrokernelTester() 2093 .mr(4) 2094 .nr(16) 2095 .kr(1) 2096 .sr(1) 2097 .m(m) 2098 .n(n) 2099 .k(8) 2100 .iterations(1) 2101 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2102 } 2103 } 2104 } 2105 
// k = 8, n fixed at 16, m swept over [1, 4]; one tester iteration per m.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(m)
      .n(16)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k = 8, m fixed at 4, n swept over [1, 16]; one tester iteration per n.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile with k swept over [1, 7].
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile with k swept over [1, 7] and a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(1)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_lt_8_subtile)2169 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_lt_8_subtile) { 2170 TEST_REQUIRES_ARM_NEON; 2171 for (size_t k = 1; k < 8; k++) { 2172 for (uint32_t n = 1; n <= 16; n++) { 2173 for (uint32_t m = 1; m <= 4; m++) { 2174 GemmMicrokernelTester() 2175 .mr(4) 2176 .nr(16) 2177 .kr(1) 2178 .sr(1) 2179 .m(m) 2180 .n(n) 2181 .k(k) 2182 .iterations(1) 2183 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2184 } 2185 } 2186 } 2187 } 2188 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_gt_8)2189 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_gt_8) { 2190 TEST_REQUIRES_ARM_NEON; 2191 for (size_t k = 9; k < 16; k++) { 2192 GemmMicrokernelTester() 2193 .mr(4) 2194 .nr(16) 2195 .kr(1) 2196 .sr(1) 2197 .m(4) 2198 .n(16) 2199 .k(k) 2200 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2201 } 2202 } 2203 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_gt_8_strided_a)2204 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_gt_8_strided_a) { 2205 TEST_REQUIRES_ARM_NEON; 2206 for (size_t k = 9; k < 16; k++) { 2207 GemmMicrokernelTester() 2208 .mr(4) 2209 .nr(16) 2210 .kr(1) 2211 .sr(1) 2212 .m(4) 2213 .n(16) 2214 .k(k) 2215 .a_stride(19) 2216 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2217 } 2218 } 2219 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_gt_8_subtile)2220 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_gt_8_subtile) { 2221 TEST_REQUIRES_ARM_NEON; 2222 for (size_t k = 9; k < 16; k++) { 2223 for (uint32_t n = 1; n <= 16; n++) { 2224 for (uint32_t m = 1; m <= 4; m++) { 2225 GemmMicrokernelTester() 2226 .mr(4) 2227 .nr(16) 2228 .kr(1) 2229 .sr(1) 2230 .m(m) 2231 .n(n) 2232 .k(k) 2233 .iterations(1) 2234 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2235 } 2236 } 2237 } 2238 } 2239 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_div_8)2240 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_div_8) { 2241 TEST_REQUIRES_ARM_NEON; 2242 for (size_t k = 16; k <= 80; k += 8) { 2243 GemmMicrokernelTester() 2244 .mr(4) 2245 .nr(16) 2246 .kr(1) 2247 .sr(1) 2248 .m(4) 2249 .n(16) 2250 .k(k) 2251 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2252 } 2253 } 2254 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_div_8_strided_a)2255 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_div_8_strided_a) { 2256 TEST_REQUIRES_ARM_NEON; 2257 for (size_t k = 16; k <= 80; k += 8) { 2258 GemmMicrokernelTester() 2259 .mr(4) 2260 .nr(16) 2261 .kr(1) 2262 .sr(1) 2263 .m(4) 2264 .n(16) 2265 .k(k) 2266 .a_stride(83) 2267 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2268 } 2269 } 2270 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,k_div_8_subtile)2271 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, k_div_8_subtile) { 2272 TEST_REQUIRES_ARM_NEON; 2273 for (size_t k = 16; k <= 80; k += 8) { 2274 for (uint32_t n = 1; n <= 16; n++) { 2275 for (uint32_t m = 1; m <= 4; m++) { 2276 GemmMicrokernelTester() 2277 .mr(4) 2278 .nr(16) 2279 .kr(1) 2280 .sr(1) 2281 .m(m) 2282 .n(n) 2283 .k(k) 2284 .iterations(1) 2285 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2286 } 2287 } 2288 } 2289 } 2290 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_gt_16)2291 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_gt_16) { 2292 TEST_REQUIRES_ARM_NEON; 2293 for (uint32_t n = 17; n < 32; n++) { 2294 for (size_t k = 1; k <= 40; k += 9) { 2295 GemmMicrokernelTester() 2296 
.mr(4) 2297 .nr(16) 2298 .kr(1) 2299 .sr(1) 2300 .m(4) 2301 .n(n) 2302 .k(k) 2303 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2304 } 2305 } 2306 } 2307 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_gt_16_strided_cn)2308 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_gt_16_strided_cn) { 2309 TEST_REQUIRES_ARM_NEON; 2310 for (uint32_t n = 17; n < 32; n++) { 2311 for (size_t k = 1; k <= 40; k += 9) { 2312 GemmMicrokernelTester() 2313 .mr(4) 2314 .nr(16) 2315 .kr(1) 2316 .sr(1) 2317 .m(4) 2318 .n(n) 2319 .k(k) 2320 .cn_stride(19) 2321 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2322 } 2323 } 2324 } 2325 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_gt_16_strided_a)2326 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_gt_16_strided_a) { 2327 TEST_REQUIRES_ARM_NEON; 2328 for (uint32_t n = 17; n < 32; n++) { 2329 for (size_t k = 1; k <= 40; k += 9) { 2330 GemmMicrokernelTester() 2331 .mr(4) 2332 .nr(16) 2333 .kr(1) 2334 .sr(1) 2335 .m(4) 2336 .n(n) 2337 .k(k) 2338 .a_stride(43) 2339 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2340 } 2341 } 2342 } 2343 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_gt_16_subtile)2344 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_gt_16_subtile) { 2345 TEST_REQUIRES_ARM_NEON; 2346 for (uint32_t n = 17; n < 32; n++) { 2347 for (size_t k = 1; k <= 40; k += 9) { 2348 for (uint32_t m = 1; m <= 4; m++) { 2349 GemmMicrokernelTester() 2350 .mr(4) 2351 .nr(16) 2352 .kr(1) 2353 .sr(1) 2354 .m(m) 2355 .n(n) 2356 .k(k) 2357 .iterations(1) 2358 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2359 } 2360 } 2361 } 2362 } 2363 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_div_16)2364 
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_div_16) { 2365 TEST_REQUIRES_ARM_NEON; 2366 for (uint32_t n = 32; n <= 48; n += 16) { 2367 for (size_t k = 1; k <= 40; k += 9) { 2368 GemmMicrokernelTester() 2369 .mr(4) 2370 .nr(16) 2371 .kr(1) 2372 .sr(1) 2373 .m(4) 2374 .n(n) 2375 .k(k) 2376 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2377 } 2378 } 2379 } 2380 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_div_16_strided_cn)2381 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_div_16_strided_cn) { 2382 TEST_REQUIRES_ARM_NEON; 2383 for (uint32_t n = 32; n <= 48; n += 16) { 2384 for (size_t k = 1; k <= 40; k += 9) { 2385 GemmMicrokernelTester() 2386 .mr(4) 2387 .nr(16) 2388 .kr(1) 2389 .sr(1) 2390 .m(4) 2391 .n(n) 2392 .k(k) 2393 .cn_stride(19) 2394 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2395 } 2396 } 2397 } 2398 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_div_16_strided_a)2399 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_div_16_strided_a) { 2400 TEST_REQUIRES_ARM_NEON; 2401 for (uint32_t n = 32; n <= 48; n += 16) { 2402 for (size_t k = 1; k <= 40; k += 9) { 2403 GemmMicrokernelTester() 2404 .mr(4) 2405 .nr(16) 2406 .kr(1) 2407 .sr(1) 2408 .m(4) 2409 .n(n) 2410 .k(k) 2411 .a_stride(43) 2412 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2413 } 2414 } 2415 } 2416 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,n_div_16_subtile)2417 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, n_div_16_subtile) { 2418 TEST_REQUIRES_ARM_NEON; 2419 for (uint32_t n = 32; n <= 48; n += 16) { 2420 for (size_t k = 1; k <= 40; k += 9) { 2421 for (uint32_t m = 1; m <= 4; m++) { 2422 GemmMicrokernelTester() 2423 .mr(4) 2424 .nr(16) 2425 .kr(1) 2426 .sr(1) 2427 .m(m) 2428 .n(n) 2429 .k(k) 2430 .iterations(1) 
2431 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2432 } 2433 } 2434 } 2435 } 2436 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,strided_cm_subtile)2437 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, strided_cm_subtile) { 2438 TEST_REQUIRES_ARM_NEON; 2439 for (size_t k = 1; k <= 40; k += 9) { 2440 for (uint32_t n = 1; n <= 16; n++) { 2441 for (uint32_t m = 1; m <= 4; m++) { 2442 GemmMicrokernelTester() 2443 .mr(4) 2444 .nr(16) 2445 .kr(1) 2446 .sr(1) 2447 .m(m) 2448 .n(n) 2449 .k(k) 2450 .cm_stride(19) 2451 .iterations(1) 2452 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2453 } 2454 } 2455 } 2456 } 2457 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,qmin)2458 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, qmin) { 2459 TEST_REQUIRES_ARM_NEON; 2460 GemmMicrokernelTester() 2461 .mr(4) 2462 .nr(16) 2463 .kr(1) 2464 .sr(1) 2465 .m(4) 2466 .n(16) 2467 .k(8) 2468 .qmin(128) 2469 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2470 } 2471 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,qmax)2472 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, qmax) { 2473 TEST_REQUIRES_ARM_NEON; 2474 GemmMicrokernelTester() 2475 .mr(4) 2476 .nr(16) 2477 .kr(1) 2478 .sr(1) 2479 .m(4) 2480 .n(16) 2481 .k(8) 2482 .qmax(128) 2483 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2484 } 2485 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,strided_cm)2486 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, strided_cm) { 2487 TEST_REQUIRES_ARM_NEON; 2488 GemmMicrokernelTester() 2489 .mr(4) 2490 .nr(16) 2491 .kr(1) 2492 .sr(1) 2493 .m(4) 2494 .n(16) 2495 .k(8) 2496 .cm_stride(19) 2497 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2498 } 2499 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,no_a_zero_point)2500 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, no_a_zero_point) { 2501 TEST_REQUIRES_ARM_NEON; 2502 for (size_t k = 1; k <= 40; k += 9) { 2503 GemmMicrokernelTester() 2504 .mr(4) 2505 .nr(16) 2506 .kr(1) 2507 .sr(1) 2508 .m(4) 2509 .n(16) 2510 .k(k) 2511 .a_zero_point(0) 2512 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2513 } 2514 } 2515 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,no_b_zero_point)2516 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, no_b_zero_point) { 2517 TEST_REQUIRES_ARM_NEON; 2518 for (size_t k = 1; k <= 40; k += 9) { 2519 GemmMicrokernelTester() 2520 .mr(4) 2521 .nr(16) 2522 .kr(1) 2523 .sr(1) 2524 .m(4) 2525 .n(16) 2526 .k(k) 2527 .b_zero_point(0) 2528 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2529 } 2530 } 2531 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE,no_zero_point)2532 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__NEON_MLAL_LANE, no_zero_point) { 2533 TEST_REQUIRES_ARM_NEON; 2534 for (size_t k = 1; k <= 40; k += 9) { 2535 GemmMicrokernelTester() 2536 .mr(4) 2537 .nr(16) 2538 .kr(1) 2539 .sr(1) 2540 .m(4) 2541 .n(16) 2542 .k(k) 2543 .a_zero_point(0) 2544 .b_zero_point(0) 2545 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2546 } 2547 } 2548 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2549 2550 2551 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_eq_8)2552 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8) { 2553 TEST_REQUIRES_ARM_NEON; 2554 GemmMicrokernelTester() 2555 .mr(6) 2556 .nr(16) 2557 .kr(1) 2558 .sr(1) 2559 .m(6) 2560 .n(16) 2561 .k(8) 2562 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2563 } 2564 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,strided_cn)2565 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cn) { 2566 TEST_REQUIRES_ARM_NEON; 2567 GemmMicrokernelTester() 2568 .mr(6) 2569 .nr(16) 2570 .kr(1) 2571 .sr(1) 2572 .m(6) 2573 .n(16) 2574 .k(8) 2575 .cn_stride(19) 2576 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2577 } 2578 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_eq_8_strided_a)2579 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_strided_a) { 2580 TEST_REQUIRES_ARM_NEON; 2581 GemmMicrokernelTester() 2582 .mr(6) 2583 .nr(16) 2584 .kr(1) 2585 .sr(1) 2586 .m(6) 2587 .n(16) 2588 .k(8) 2589 .a_stride(11) 2590 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2591 } 2592 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_eq_8_subtile)2593 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile) { 2594 TEST_REQUIRES_ARM_NEON; 2595 for (uint32_t n = 1; n <= 16; n++) { 2596 for (uint32_t m = 1; m <= 6; m++) { 2597 GemmMicrokernelTester() 2598 .mr(6) 2599 .nr(16) 2600 .kr(1) 2601 .sr(1) 2602 .m(m) 2603 .n(n) 2604 .k(8) 2605 .iterations(1) 2606 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2607 } 2608 } 2609 } 2610 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_eq_8_subtile_m)2611 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile_m) { 2612 TEST_REQUIRES_ARM_NEON; 2613 for (uint32_t m = 1; m <= 6; m++) { 2614 GemmMicrokernelTester() 2615 .mr(6) 2616 .nr(16) 2617 .kr(1) 2618 .sr(1) 2619 .m(m) 2620 .n(16) 2621 .k(8) 2622 .iterations(1) 2623 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2624 } 2625 } 2626 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_eq_8_subtile_n)2627 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_eq_8_subtile_n) { 2628 TEST_REQUIRES_ARM_NEON; 2629 for (uint32_t n = 1; n <= 16; n++) { 2630 GemmMicrokernelTester() 2631 .mr(6) 2632 .nr(16) 2633 .kr(1) 2634 .sr(1) 2635 .m(6) 2636 .n(n) 2637 .k(8) 2638 .iterations(1) 2639 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2640 } 2641 } 2642 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_lt_8)2643 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8) { 2644 TEST_REQUIRES_ARM_NEON; 2645 for (size_t k = 1; k < 8; k++) { 2646 GemmMicrokernelTester() 2647 .mr(6) 2648 .nr(16) 2649 .kr(1) 2650 .sr(1) 2651 .m(6) 2652 .n(16) 2653 .k(k) 2654 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2655 } 2656 } 2657 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_lt_8_strided_a)2658 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8_strided_a) { 2659 TEST_REQUIRES_ARM_NEON; 2660 for (size_t k = 1; k < 8; k++) { 2661 GemmMicrokernelTester() 2662 .mr(6) 2663 .nr(16) 2664 .kr(1) 2665 .sr(1) 2666 .m(6) 2667 .n(16) 2668 .k(k) 2669 .a_stride(11) 2670 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2671 } 2672 } 2673 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_lt_8_subtile)2674 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_lt_8_subtile) { 2675 TEST_REQUIRES_ARM_NEON; 2676 for (size_t k = 1; k < 8; k++) { 2677 for (uint32_t n = 1; n <= 16; n++) { 2678 for (uint32_t m = 1; m <= 6; m++) { 2679 GemmMicrokernelTester() 2680 .mr(6) 2681 .nr(16) 2682 .kr(1) 2683 .sr(1) 2684 .m(m) 2685 .n(n) 2686 .k(k) 2687 .iterations(1) 2688 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2689 } 2690 } 2691 } 2692 } 2693 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_gt_8)2694 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8) { 2695 TEST_REQUIRES_ARM_NEON; 2696 for (size_t k = 9; k < 16; k++) { 2697 GemmMicrokernelTester() 2698 .mr(6) 2699 .nr(16) 2700 .kr(1) 2701 .sr(1) 2702 .m(6) 2703 .n(16) 2704 .k(k) 2705 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2706 } 2707 } 2708 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_gt_8_strided_a)2709 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8_strided_a) { 2710 TEST_REQUIRES_ARM_NEON; 2711 for (size_t k = 9; k < 16; k++) { 2712 GemmMicrokernelTester() 2713 .mr(6) 2714 .nr(16) 2715 .kr(1) 2716 .sr(1) 2717 .m(6) 2718 .n(16) 2719 .k(k) 2720 .a_stride(19) 2721 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2722 } 2723 } 2724 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_gt_8_subtile)2725 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_gt_8_subtile) { 2726 TEST_REQUIRES_ARM_NEON; 2727 for (size_t k = 9; k < 16; k++) { 2728 for (uint32_t n = 1; n <= 16; n++) { 2729 for (uint32_t m = 1; m <= 6; m++) { 2730 GemmMicrokernelTester() 2731 .mr(6) 2732 .nr(16) 2733 .kr(1) 2734 .sr(1) 2735 .m(m) 2736 .n(n) 2737 .k(k) 2738 .iterations(1) 2739 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2740 } 2741 } 2742 } 2743 } 2744 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_div_8)2745 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8) { 2746 TEST_REQUIRES_ARM_NEON; 2747 for (size_t k = 16; k <= 80; k += 8) { 2748 GemmMicrokernelTester() 2749 .mr(6) 2750 .nr(16) 2751 .kr(1) 2752 .sr(1) 2753 .m(6) 2754 .n(16) 
2755 .k(k) 2756 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2757 } 2758 } 2759 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_div_8_strided_a)2760 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8_strided_a) { 2761 TEST_REQUIRES_ARM_NEON; 2762 for (size_t k = 16; k <= 80; k += 8) { 2763 GemmMicrokernelTester() 2764 .mr(6) 2765 .nr(16) 2766 .kr(1) 2767 .sr(1) 2768 .m(6) 2769 .n(16) 2770 .k(k) 2771 .a_stride(83) 2772 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2773 } 2774 } 2775 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,k_div_8_subtile)2776 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, k_div_8_subtile) { 2777 TEST_REQUIRES_ARM_NEON; 2778 for (size_t k = 16; k <= 80; k += 8) { 2779 for (uint32_t n = 1; n <= 16; n++) { 2780 for (uint32_t m = 1; m <= 6; m++) { 2781 GemmMicrokernelTester() 2782 .mr(6) 2783 .nr(16) 2784 .kr(1) 2785 .sr(1) 2786 .m(m) 2787 .n(n) 2788 .k(k) 2789 .iterations(1) 2790 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2791 } 2792 } 2793 } 2794 } 2795 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_gt_16)2796 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16) { 2797 TEST_REQUIRES_ARM_NEON; 2798 for (uint32_t n = 17; n < 32; n++) { 2799 for (size_t k = 1; k <= 40; k += 9) { 2800 GemmMicrokernelTester() 2801 .mr(6) 2802 .nr(16) 2803 .kr(1) 2804 .sr(1) 2805 .m(6) 2806 .n(n) 2807 .k(k) 2808 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2809 } 2810 } 2811 } 2812 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_gt_16_strided_cn)2813 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_strided_cn) { 2814 TEST_REQUIRES_ARM_NEON; 2815 for (uint32_t n = 17; n < 32; n++) { 2816 for 
(size_t k = 1; k <= 40; k += 9) { 2817 GemmMicrokernelTester() 2818 .mr(6) 2819 .nr(16) 2820 .kr(1) 2821 .sr(1) 2822 .m(6) 2823 .n(n) 2824 .k(k) 2825 .cn_stride(19) 2826 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2827 } 2828 } 2829 } 2830 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_gt_16_strided_a)2831 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_strided_a) { 2832 TEST_REQUIRES_ARM_NEON; 2833 for (uint32_t n = 17; n < 32; n++) { 2834 for (size_t k = 1; k <= 40; k += 9) { 2835 GemmMicrokernelTester() 2836 .mr(6) 2837 .nr(16) 2838 .kr(1) 2839 .sr(1) 2840 .m(6) 2841 .n(n) 2842 .k(k) 2843 .a_stride(43) 2844 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2845 } 2846 } 2847 } 2848 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_gt_16_subtile)2849 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_gt_16_subtile) { 2850 TEST_REQUIRES_ARM_NEON; 2851 for (uint32_t n = 17; n < 32; n++) { 2852 for (size_t k = 1; k <= 40; k += 9) { 2853 for (uint32_t m = 1; m <= 6; m++) { 2854 GemmMicrokernelTester() 2855 .mr(6) 2856 .nr(16) 2857 .kr(1) 2858 .sr(1) 2859 .m(m) 2860 .n(n) 2861 .k(k) 2862 .iterations(1) 2863 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2864 } 2865 } 2866 } 2867 } 2868 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_div_16)2869 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16) { 2870 TEST_REQUIRES_ARM_NEON; 2871 for (uint32_t n = 32; n <= 48; n += 16) { 2872 for (size_t k = 1; k <= 40; k += 9) { 2873 GemmMicrokernelTester() 2874 .mr(6) 2875 .nr(16) 2876 .kr(1) 2877 .sr(1) 2878 .m(6) 2879 .n(n) 2880 .k(k) 2881 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2882 } 2883 } 2884 } 2885 
TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_div_16_strided_cn)2886 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_strided_cn) { 2887 TEST_REQUIRES_ARM_NEON; 2888 for (uint32_t n = 32; n <= 48; n += 16) { 2889 for (size_t k = 1; k <= 40; k += 9) { 2890 GemmMicrokernelTester() 2891 .mr(6) 2892 .nr(16) 2893 .kr(1) 2894 .sr(1) 2895 .m(6) 2896 .n(n) 2897 .k(k) 2898 .cn_stride(19) 2899 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2900 } 2901 } 2902 } 2903 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_div_16_strided_a)2904 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_strided_a) { 2905 TEST_REQUIRES_ARM_NEON; 2906 for (uint32_t n = 32; n <= 48; n += 16) { 2907 for (size_t k = 1; k <= 40; k += 9) { 2908 GemmMicrokernelTester() 2909 .mr(6) 2910 .nr(16) 2911 .kr(1) 2912 .sr(1) 2913 .m(6) 2914 .n(n) 2915 .k(k) 2916 .a_stride(43) 2917 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2918 } 2919 } 2920 } 2921 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,n_div_16_subtile)2922 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, n_div_16_subtile) { 2923 TEST_REQUIRES_ARM_NEON; 2924 for (uint32_t n = 32; n <= 48; n += 16) { 2925 for (size_t k = 1; k <= 40; k += 9) { 2926 for (uint32_t m = 1; m <= 6; m++) { 2927 GemmMicrokernelTester() 2928 .mr(6) 2929 .nr(16) 2930 .kr(1) 2931 .sr(1) 2932 .m(m) 2933 .n(n) 2934 .k(k) 2935 .iterations(1) 2936 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2937 } 2938 } 2939 } 2940 } 2941 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,strided_cm_subtile)2942 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cm_subtile) { 2943 TEST_REQUIRES_ARM_NEON; 2944 for (size_t k = 1; k <= 40; k += 9) { 2945 for (uint32_t n = 1; n <= 16; n++) { 2946 for (uint32_t m = 1; 
m <= 6; m++) { 2947 GemmMicrokernelTester() 2948 .mr(6) 2949 .nr(16) 2950 .kr(1) 2951 .sr(1) 2952 .m(m) 2953 .n(n) 2954 .k(k) 2955 .cm_stride(19) 2956 .iterations(1) 2957 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2958 } 2959 } 2960 } 2961 } 2962 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,qmin)2963 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, qmin) { 2964 TEST_REQUIRES_ARM_NEON; 2965 GemmMicrokernelTester() 2966 .mr(6) 2967 .nr(16) 2968 .kr(1) 2969 .sr(1) 2970 .m(6) 2971 .n(16) 2972 .k(8) 2973 .qmin(128) 2974 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2975 } 2976 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,qmax)2977 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, qmax) { 2978 TEST_REQUIRES_ARM_NEON; 2979 GemmMicrokernelTester() 2980 .mr(6) 2981 .nr(16) 2982 .kr(1) 2983 .sr(1) 2984 .m(6) 2985 .n(16) 2986 .k(8) 2987 .qmax(128) 2988 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 2989 } 2990 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,strided_cm)2991 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, strided_cm) { 2992 TEST_REQUIRES_ARM_NEON; 2993 GemmMicrokernelTester() 2994 .mr(6) 2995 .nr(16) 2996 .kr(1) 2997 .sr(1) 2998 .m(6) 2999 .n(16) 3000 .k(8) 3001 .cm_stride(19) 3002 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3003 } 3004 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,no_a_zero_point)3005 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, no_a_zero_point) { 3006 TEST_REQUIRES_ARM_NEON; 3007 for (size_t k = 1; k <= 40; k += 9) { 3008 GemmMicrokernelTester() 3009 .mr(6) 3010 .nr(16) 3011 .kr(1) 3012 .sr(1) 3013 .m(6) 3014 .n(16) 3015 .k(k) 3016 .a_zero_point(0) 3017 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3018 } 3019 } 3020 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,no_b_zero_point)3021 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, no_b_zero_point) { 3022 TEST_REQUIRES_ARM_NEON; 3023 for (size_t k = 1; k <= 40; k += 9) { 3024 GemmMicrokernelTester() 3025 .mr(6) 3026 .nr(16) 3027 .kr(1) 3028 .sr(1) 3029 .m(6) 3030 .n(16) 3031 .k(k) 3032 .b_zero_point(0) 3033 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3034 } 3035 } 3036 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE,no_zero_point)3037 TEST(QU8_GEMM_MINMAX_RNDNU_6X16__NEON_MLAL_LANE, no_zero_point) { 3038 TEST_REQUIRES_ARM_NEON; 3039 for (size_t k = 1; k <= 40; k += 9) { 3040 GemmMicrokernelTester() 3041 .mr(6) 3042 .nr(16) 3043 .kr(1) 3044 .sr(1) 3045 .m(6) 3046 .n(16) 3047 .k(k) 3048 .a_zero_point(0) 3049 .b_zero_point(0) 3050 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3051 } 3052 } 3053 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 3054 3055 3056 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_8)3057 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_8) { 3058 TEST_REQUIRES_ARM_NEON_DOT; 3059 GemmMicrokernelTester() 3060 .mr(4) 3061 .nr(8) 3062 .kr(4) 3063 .sr(1) 3064 .m(4) 3065 .n(8) 3066 .k(8) 3067 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3068 } 3069 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,strided_cn)3070 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) { 3071 TEST_REQUIRES_ARM_NEON_DOT; 3072 GemmMicrokernelTester() 3073 .mr(4) 3074 .nr(8) 3075 .kr(4) 3076 .sr(1) 3077 .m(4) 3078 
.n(8) 3079 .k(8) 3080 .cn_stride(11) 3081 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3082 } 3083 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_8_strided_a)3084 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_8_strided_a) { 3085 TEST_REQUIRES_ARM_NEON_DOT; 3086 GemmMicrokernelTester() 3087 .mr(4) 3088 .nr(8) 3089 .kr(4) 3090 .sr(1) 3091 .m(4) 3092 .n(8) 3093 .k(8) 3094 .a_stride(11) 3095 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3096 } 3097 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_8_subtile)3098 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_8_subtile) { 3099 TEST_REQUIRES_ARM_NEON_DOT; 3100 for (uint32_t n = 1; n <= 8; n++) { 3101 for (uint32_t m = 1; m <= 4; m++) { 3102 GemmMicrokernelTester() 3103 .mr(4) 3104 .nr(8) 3105 .kr(4) 3106 .sr(1) 3107 .m(m) 3108 .n(n) 3109 .k(8) 3110 .iterations(1) 3111 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3112 } 3113 } 3114 } 3115 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_8_subtile_m)3116 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_8_subtile_m) { 3117 TEST_REQUIRES_ARM_NEON_DOT; 3118 for (uint32_t m = 1; m <= 4; m++) { 3119 GemmMicrokernelTester() 3120 .mr(4) 3121 .nr(8) 3122 .kr(4) 3123 .sr(1) 3124 .m(m) 3125 .n(8) 3126 .k(8) 3127 .iterations(1) 3128 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3129 } 3130 } 3131 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_eq_8_subtile_n)3132 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_8_subtile_n) { 3133 
TEST_REQUIRES_ARM_NEON_DOT; 3134 for (uint32_t n = 1; n <= 8; n++) { 3135 GemmMicrokernelTester() 3136 .mr(4) 3137 .nr(8) 3138 .kr(4) 3139 .sr(1) 3140 .m(4) 3141 .n(n) 3142 .k(8) 3143 .iterations(1) 3144 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3145 } 3146 } 3147 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_lt_8)3148 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_8) { 3149 TEST_REQUIRES_ARM_NEON_DOT; 3150 for (size_t k = 1; k < 8; k++) { 3151 GemmMicrokernelTester() 3152 .mr(4) 3153 .nr(8) 3154 .kr(4) 3155 .sr(1) 3156 .m(4) 3157 .n(8) 3158 .k(k) 3159 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3160 } 3161 } 3162 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_lt_8_strided_a)3163 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_8_strided_a) { 3164 TEST_REQUIRES_ARM_NEON_DOT; 3165 for (size_t k = 1; k < 8; k++) { 3166 GemmMicrokernelTester() 3167 .mr(4) 3168 .nr(8) 3169 .kr(4) 3170 .sr(1) 3171 .m(4) 3172 .n(8) 3173 .k(k) 3174 .a_stride(11) 3175 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3176 } 3177 } 3178 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_lt_8_subtile)3179 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_8_subtile) { 3180 TEST_REQUIRES_ARM_NEON_DOT; 3181 for (size_t k = 1; k < 8; k++) { 3182 for (uint32_t n = 1; n <= 8; n++) { 3183 for (uint32_t m = 1; m <= 4; m++) { 3184 GemmMicrokernelTester() 3185 .mr(4) 3186 .nr(8) 3187 .kr(4) 3188 .sr(1) 3189 .m(m) 3190 .n(n) 3191 .k(k) 3192 .iterations(1) 3193 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3194 
} 3195 } 3196 } 3197 } 3198 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_gt_8)3199 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_8) { 3200 TEST_REQUIRES_ARM_NEON_DOT; 3201 for (size_t k = 9; k < 16; k++) { 3202 GemmMicrokernelTester() 3203 .mr(4) 3204 .nr(8) 3205 .kr(4) 3206 .sr(1) 3207 .m(4) 3208 .n(8) 3209 .k(k) 3210 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3211 } 3212 } 3213 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_gt_8_strided_a)3214 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_8_strided_a) { 3215 TEST_REQUIRES_ARM_NEON_DOT; 3216 for (size_t k = 9; k < 16; k++) { 3217 GemmMicrokernelTester() 3218 .mr(4) 3219 .nr(8) 3220 .kr(4) 3221 .sr(1) 3222 .m(4) 3223 .n(8) 3224 .k(k) 3225 .a_stride(19) 3226 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3227 } 3228 } 3229 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_gt_8_subtile)3230 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_8_subtile) { 3231 TEST_REQUIRES_ARM_NEON_DOT; 3232 for (size_t k = 9; k < 16; k++) { 3233 for (uint32_t n = 1; n <= 8; n++) { 3234 for (uint32_t m = 1; m <= 4; m++) { 3235 GemmMicrokernelTester() 3236 .mr(4) 3237 .nr(8) 3238 .kr(4) 3239 .sr(1) 3240 .m(m) 3241 .n(n) 3242 .k(k) 3243 .iterations(1) 3244 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3245 } 3246 } 3247 } 3248 } 3249 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_div_8)3250 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_div_8) { 3251 TEST_REQUIRES_ARM_NEON_DOT; 3252 for (size_t k = 16; k <= 80; k += 8) { 3253 GemmMicrokernelTester() 3254 .mr(4) 3255 .nr(8) 3256 .kr(4) 3257 .sr(1) 3258 
.m(4) 3259 .n(8) 3260 .k(k) 3261 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3262 } 3263 } 3264 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_div_8_strided_a)3265 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_div_8_strided_a) { 3266 TEST_REQUIRES_ARM_NEON_DOT; 3267 for (size_t k = 16; k <= 80; k += 8) { 3268 GemmMicrokernelTester() 3269 .mr(4) 3270 .nr(8) 3271 .kr(4) 3272 .sr(1) 3273 .m(4) 3274 .n(8) 3275 .k(k) 3276 .a_stride(83) 3277 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3278 } 3279 } 3280 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,k_div_8_subtile)3281 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, k_div_8_subtile) { 3282 TEST_REQUIRES_ARM_NEON_DOT; 3283 for (size_t k = 16; k <= 80; k += 8) { 3284 for (uint32_t n = 1; n <= 8; n++) { 3285 for (uint32_t m = 1; m <= 4; m++) { 3286 GemmMicrokernelTester() 3287 .mr(4) 3288 .nr(8) 3289 .kr(4) 3290 .sr(1) 3291 .m(m) 3292 .n(n) 3293 .k(k) 3294 .iterations(1) 3295 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3296 } 3297 } 3298 } 3299 } 3300 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_8)3301 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_8) { 3302 TEST_REQUIRES_ARM_NEON_DOT; 3303 for (uint32_t n = 9; n < 16; n++) { 3304 for (size_t k = 1; k <= 40; k += 9) { 3305 GemmMicrokernelTester() 3306 .mr(4) 3307 .nr(8) 3308 .kr(4) 3309 .sr(1) 3310 .m(4) 3311 .n(n) 3312 .k(k) 3313 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3314 } 3315 } 3316 } 3317 
TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_8_strided_cn)3318 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_8_strided_cn) { 3319 TEST_REQUIRES_ARM_NEON_DOT; 3320 for (uint32_t n = 9; n < 16; n++) { 3321 for (size_t k = 1; k <= 40; k += 9) { 3322 GemmMicrokernelTester() 3323 .mr(4) 3324 .nr(8) 3325 .kr(4) 3326 .sr(1) 3327 .m(4) 3328 .n(n) 3329 .k(k) 3330 .cn_stride(11) 3331 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3332 } 3333 } 3334 } 3335 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_8_strided_a)3336 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_8_strided_a) { 3337 TEST_REQUIRES_ARM_NEON_DOT; 3338 for (uint32_t n = 9; n < 16; n++) { 3339 for (size_t k = 1; k <= 40; k += 9) { 3340 GemmMicrokernelTester() 3341 .mr(4) 3342 .nr(8) 3343 .kr(4) 3344 .sr(1) 3345 .m(4) 3346 .n(n) 3347 .k(k) 3348 .a_stride(43) 3349 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3350 } 3351 } 3352 } 3353 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_gt_8_subtile)3354 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_8_subtile) { 3355 TEST_REQUIRES_ARM_NEON_DOT; 3356 for (uint32_t n = 9; n < 16; n++) { 3357 for (size_t k = 1; k <= 40; k += 9) { 3358 for (uint32_t m = 1; m <= 4; m++) { 3359 GemmMicrokernelTester() 3360 .mr(4) 3361 .nr(8) 3362 .kr(4) 3363 .sr(1) 3364 .m(m) 3365 .n(n) 3366 .k(k) 3367 .iterations(1) 3368 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3369 } 3370 } 3371 } 3372 } 3373 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_div_8)3374 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_div_8) { 3375 TEST_REQUIRES_ARM_NEON_DOT; 3376 
for (uint32_t n = 16; n <= 24; n += 8) { 3377 for (size_t k = 1; k <= 40; k += 9) { 3378 GemmMicrokernelTester() 3379 .mr(4) 3380 .nr(8) 3381 .kr(4) 3382 .sr(1) 3383 .m(4) 3384 .n(n) 3385 .k(k) 3386 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3387 } 3388 } 3389 } 3390 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_div_8_strided_cn)3391 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_div_8_strided_cn) { 3392 TEST_REQUIRES_ARM_NEON_DOT; 3393 for (uint32_t n = 16; n <= 24; n += 8) { 3394 for (size_t k = 1; k <= 40; k += 9) { 3395 GemmMicrokernelTester() 3396 .mr(4) 3397 .nr(8) 3398 .kr(4) 3399 .sr(1) 3400 .m(4) 3401 .n(n) 3402 .k(k) 3403 .cn_stride(11) 3404 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3405 } 3406 } 3407 } 3408 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_div_8_strided_a)3409 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_div_8_strided_a) { 3410 TEST_REQUIRES_ARM_NEON_DOT; 3411 for (uint32_t n = 16; n <= 24; n += 8) { 3412 for (size_t k = 1; k <= 40; k += 9) { 3413 GemmMicrokernelTester() 3414 .mr(4) 3415 .nr(8) 3416 .kr(4) 3417 .sr(1) 3418 .m(4) 3419 .n(n) 3420 .k(k) 3421 .a_stride(43) 3422 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3423 } 3424 } 3425 } 3426 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,n_div_8_subtile)3427 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, n_div_8_subtile) { 3428 TEST_REQUIRES_ARM_NEON_DOT; 3429 for (uint32_t n = 16; n <= 24; n += 8) { 3430 for (size_t k = 1; k <= 40; k += 9) { 3431 for (uint32_t m = 1; m <= 4; m++) { 3432 GemmMicrokernelTester() 3433 .mr(4) 3434 .nr(8) 3435 .kr(4) 3436 .sr(1) 3437 .m(m) 3438 .n(n) 3439 .k(k) 
3440 .iterations(1) 3441 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3442 } 3443 } 3444 } 3445 } 3446 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,strided_cm_subtile)3447 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) { 3448 TEST_REQUIRES_ARM_NEON_DOT; 3449 for (size_t k = 1; k <= 40; k += 9) { 3450 for (uint32_t n = 1; n <= 8; n++) { 3451 for (uint32_t m = 1; m <= 4; m++) { 3452 GemmMicrokernelTester() 3453 .mr(4) 3454 .nr(8) 3455 .kr(4) 3456 .sr(1) 3457 .m(m) 3458 .n(n) 3459 .k(k) 3460 .cm_stride(11) 3461 .iterations(1) 3462 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3463 } 3464 } 3465 } 3466 } 3467 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,qmin)3468 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, qmin) { 3469 TEST_REQUIRES_ARM_NEON_DOT; 3470 GemmMicrokernelTester() 3471 .mr(4) 3472 .nr(8) 3473 .kr(4) 3474 .sr(1) 3475 .m(4) 3476 .n(8) 3477 .k(8) 3478 .qmin(128) 3479 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3480 } 3481 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,qmax)3482 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, qmax) { 3483 TEST_REQUIRES_ARM_NEON_DOT; 3484 GemmMicrokernelTester() 3485 .mr(4) 3486 .nr(8) 3487 .kr(4) 3488 .sr(1) 3489 .m(4) 3490 .n(8) 3491 .k(8) 3492 .qmax(128) 3493 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3494 } 3495 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,strided_cm)3496 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) { 3497 TEST_REQUIRES_ARM_NEON_DOT; 3498 
GemmMicrokernelTester() 3499 .mr(4) 3500 .nr(8) 3501 .kr(4) 3502 .sr(1) 3503 .m(4) 3504 .n(8) 3505 .k(8) 3506 .cm_stride(11) 3507 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3508 } 3509 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,no_a_zero_point)3510 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, no_a_zero_point) { 3511 TEST_REQUIRES_ARM_NEON_DOT; 3512 for (size_t k = 1; k <= 40; k += 9) { 3513 GemmMicrokernelTester() 3514 .mr(4) 3515 .nr(8) 3516 .kr(4) 3517 .sr(1) 3518 .m(4) 3519 .n(8) 3520 .k(k) 3521 .a_zero_point(0) 3522 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3523 } 3524 } 3525 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,no_b_zero_point)3526 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, no_b_zero_point) { 3527 TEST_REQUIRES_ARM_NEON_DOT; 3528 for (size_t k = 1; k <= 40; k += 9) { 3529 GemmMicrokernelTester() 3530 .mr(4) 3531 .nr(8) 3532 .kr(4) 3533 .sr(1) 3534 .m(4) 3535 .n(8) 3536 .k(k) 3537 .b_zero_point(0) 3538 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3539 } 3540 } 3541 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55,no_zero_point)3542 TEST(QU8_GEMM_MINMAX_RNDNU_4X8C4__AARCH64_NEONDOT_CORTEX_A55, no_zero_point) { 3543 TEST_REQUIRES_ARM_NEON_DOT; 3544 for (size_t k = 1; k <= 40; k += 9) { 3545 GemmMicrokernelTester() 3546 .mr(4) 3547 .nr(8) 3548 .kr(4) 3549 .sr(1) 3550 .m(4) 3551 .n(8) 3552 .k(k) 3553 .a_zero_point(0) 3554 .b_zero_point(0) 3555 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3556 } 3557 } 3558 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 3559 3560 3561 #if 
XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_eq_8)3562 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8) { 3563 TEST_REQUIRES_ARM_NEON_DOT; 3564 GemmMicrokernelTester() 3565 .mr(1) 3566 .nr(8) 3567 .kr(4) 3568 .sr(1) 3569 .m(1) 3570 .n(8) 3571 .k(8) 3572 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3573 } 3574 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,strided_cn)3575 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cn) { 3576 TEST_REQUIRES_ARM_NEON_DOT; 3577 GemmMicrokernelTester() 3578 .mr(1) 3579 .nr(8) 3580 .kr(4) 3581 .sr(1) 3582 .m(1) 3583 .n(8) 3584 .k(8) 3585 .cn_stride(11) 3586 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3587 } 3588 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_eq_8_strided_a)3589 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_strided_a) { 3590 TEST_REQUIRES_ARM_NEON_DOT; 3591 GemmMicrokernelTester() 3592 .mr(1) 3593 .nr(8) 3594 .kr(4) 3595 .sr(1) 3596 .m(1) 3597 .n(8) 3598 .k(8) 3599 .a_stride(11) 3600 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3601 } 3602 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_eq_8_subtile)3603 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile) { 3604 TEST_REQUIRES_ARM_NEON_DOT; 3605 for (uint32_t n = 1; n <= 8; n++) { 3606 for (uint32_t m = 1; m <= 1; m++) { 3607 GemmMicrokernelTester() 3608 .mr(1) 3609 .nr(8) 3610 .kr(4) 3611 .sr(1) 3612 .m(m) 3613 .n(n) 3614 .k(8) 3615 .iterations(1) 3616 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3617 } 3618 } 3619 } 3620 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_eq_8_subtile_m)3621 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile_m) { 3622 TEST_REQUIRES_ARM_NEON_DOT; 3623 for 
(uint32_t m = 1; m <= 1; m++) { 3624 GemmMicrokernelTester() 3625 .mr(1) 3626 .nr(8) 3627 .kr(4) 3628 .sr(1) 3629 .m(m) 3630 .n(8) 3631 .k(8) 3632 .iterations(1) 3633 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3634 } 3635 } 3636 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_eq_8_subtile_n)3637 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_eq_8_subtile_n) { 3638 TEST_REQUIRES_ARM_NEON_DOT; 3639 for (uint32_t n = 1; n <= 8; n++) { 3640 GemmMicrokernelTester() 3641 .mr(1) 3642 .nr(8) 3643 .kr(4) 3644 .sr(1) 3645 .m(1) 3646 .n(n) 3647 .k(8) 3648 .iterations(1) 3649 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3650 } 3651 } 3652 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_lt_8)3653 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8) { 3654 TEST_REQUIRES_ARM_NEON_DOT; 3655 for (size_t k = 1; k < 8; k++) { 3656 GemmMicrokernelTester() 3657 .mr(1) 3658 .nr(8) 3659 .kr(4) 3660 .sr(1) 3661 .m(1) 3662 .n(8) 3663 .k(k) 3664 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3665 } 3666 } 3667 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_lt_8_strided_a)3668 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8_strided_a) { 3669 TEST_REQUIRES_ARM_NEON_DOT; 3670 for (size_t k = 1; k < 8; k++) { 3671 GemmMicrokernelTester() 3672 .mr(1) 3673 .nr(8) 3674 .kr(4) 3675 .sr(1) 3676 .m(1) 3677 .n(8) 3678 .k(k) 3679 .a_stride(11) 3680 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3681 } 3682 } 3683 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_lt_8_subtile)3684 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_lt_8_subtile) { 3685 TEST_REQUIRES_ARM_NEON_DOT; 3686 for (size_t k = 1; k < 8; k++) { 3687 for (uint32_t n = 1; n <= 8; n++) { 3688 for (uint32_t m = 1; m <= 1; m++) { 3689 
GemmMicrokernelTester() 3690 .mr(1) 3691 .nr(8) 3692 .kr(4) 3693 .sr(1) 3694 .m(m) 3695 .n(n) 3696 .k(k) 3697 .iterations(1) 3698 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3699 } 3700 } 3701 } 3702 } 3703 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_gt_8)3704 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8) { 3705 TEST_REQUIRES_ARM_NEON_DOT; 3706 for (size_t k = 9; k < 16; k++) { 3707 GemmMicrokernelTester() 3708 .mr(1) 3709 .nr(8) 3710 .kr(4) 3711 .sr(1) 3712 .m(1) 3713 .n(8) 3714 .k(k) 3715 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3716 } 3717 } 3718 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_gt_8_strided_a)3719 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8_strided_a) { 3720 TEST_REQUIRES_ARM_NEON_DOT; 3721 for (size_t k = 9; k < 16; k++) { 3722 GemmMicrokernelTester() 3723 .mr(1) 3724 .nr(8) 3725 .kr(4) 3726 .sr(1) 3727 .m(1) 3728 .n(8) 3729 .k(k) 3730 .a_stride(19) 3731 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3732 } 3733 } 3734 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_gt_8_subtile)3735 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_gt_8_subtile) { 3736 TEST_REQUIRES_ARM_NEON_DOT; 3737 for (size_t k = 9; k < 16; k++) { 3738 for (uint32_t n = 1; n <= 8; n++) { 3739 for (uint32_t m = 1; m <= 1; m++) { 3740 GemmMicrokernelTester() 3741 .mr(1) 3742 .nr(8) 3743 .kr(4) 3744 .sr(1) 3745 .m(m) 3746 .n(n) 3747 .k(k) 3748 .iterations(1) 3749 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3750 } 3751 } 3752 } 3753 } 3754 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_div_8)3755 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8) { 3756 TEST_REQUIRES_ARM_NEON_DOT; 3757 for (size_t k = 16; k <= 80; k += 8) { 3758 GemmMicrokernelTester() 
3759 .mr(1) 3760 .nr(8) 3761 .kr(4) 3762 .sr(1) 3763 .m(1) 3764 .n(8) 3765 .k(k) 3766 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3767 } 3768 } 3769 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_div_8_strided_a)3770 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8_strided_a) { 3771 TEST_REQUIRES_ARM_NEON_DOT; 3772 for (size_t k = 16; k <= 80; k += 8) { 3773 GemmMicrokernelTester() 3774 .mr(1) 3775 .nr(8) 3776 .kr(4) 3777 .sr(1) 3778 .m(1) 3779 .n(8) 3780 .k(k) 3781 .a_stride(83) 3782 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3783 } 3784 } 3785 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,k_div_8_subtile)3786 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, k_div_8_subtile) { 3787 TEST_REQUIRES_ARM_NEON_DOT; 3788 for (size_t k = 16; k <= 80; k += 8) { 3789 for (uint32_t n = 1; n <= 8; n++) { 3790 for (uint32_t m = 1; m <= 1; m++) { 3791 GemmMicrokernelTester() 3792 .mr(1) 3793 .nr(8) 3794 .kr(4) 3795 .sr(1) 3796 .m(m) 3797 .n(n) 3798 .k(k) 3799 .iterations(1) 3800 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3801 } 3802 } 3803 } 3804 } 3805 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,n_gt_8)3806 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8) { 3807 TEST_REQUIRES_ARM_NEON_DOT; 3808 for (uint32_t n = 9; n < 16; n++) { 3809 for (size_t k = 1; k <= 40; k += 9) { 3810 GemmMicrokernelTester() 3811 .mr(1) 3812 .nr(8) 3813 .kr(4) 3814 .sr(1) 3815 .m(1) 3816 .n(n) 3817 .k(k) 3818 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3819 } 3820 } 3821 } 3822 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,n_gt_8_strided_cn)3823 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_strided_cn) { 3824 TEST_REQUIRES_ARM_NEON_DOT; 3825 for (uint32_t n = 9; n < 16; n++) { 3826 
for (size_t k = 1; k <= 40; k += 9) { 3827 GemmMicrokernelTester() 3828 .mr(1) 3829 .nr(8) 3830 .kr(4) 3831 .sr(1) 3832 .m(1) 3833 .n(n) 3834 .k(k) 3835 .cn_stride(11) 3836 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3837 } 3838 } 3839 } 3840 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,n_gt_8_strided_a)3841 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_strided_a) { 3842 TEST_REQUIRES_ARM_NEON_DOT; 3843 for (uint32_t n = 9; n < 16; n++) { 3844 for (size_t k = 1; k <= 40; k += 9) { 3845 GemmMicrokernelTester() 3846 .mr(1) 3847 .nr(8) 3848 .kr(4) 3849 .sr(1) 3850 .m(1) 3851 .n(n) 3852 .k(k) 3853 .a_stride(43) 3854 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3855 } 3856 } 3857 } 3858 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,n_gt_8_subtile)3859 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_gt_8_subtile) { 3860 TEST_REQUIRES_ARM_NEON_DOT; 3861 for (uint32_t n = 9; n < 16; n++) { 3862 for (size_t k = 1; k <= 40; k += 9) { 3863 for (uint32_t m = 1; m <= 1; m++) { 3864 GemmMicrokernelTester() 3865 .mr(1) 3866 .nr(8) 3867 .kr(4) 3868 .sr(1) 3869 .m(m) 3870 .n(n) 3871 .k(k) 3872 .iterations(1) 3873 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3874 } 3875 } 3876 } 3877 } 3878 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,n_div_8)3879 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8) { 3880 TEST_REQUIRES_ARM_NEON_DOT; 3881 for (uint32_t n = 16; n <= 24; n += 8) { 3882 for (size_t k = 1; k <= 40; k += 9) { 3883 GemmMicrokernelTester() 3884 .mr(1) 3885 .nr(8) 3886 .kr(4) 3887 .sr(1) 3888 .m(1) 3889 .n(n) 3890 .k(k) 3891 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 3892 } 3893 } 3894 } 3895 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,n_div_8_strided_cn)3896 
// n in {16, 24}, k in {1, 10, 19, 28, 37}, m = 1, cn_stride(11).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(4).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n in {16, 24}, k in {1, 10, 19, 28, 37}, m = 1, a_stride(43).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      GemmMicrokernelTester()
        .mr(1).nr(8).kr(4).sr(1)
        .m(1).n(n_dim).k(k_dim)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// n in {16, 24}, k in {1, 10, 19, 28, 37}, m = 1 (single-value m loop); one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n_dim = 16; n_dim <= 24; n_dim += 8) {
    for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(4).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k in {1, 10, 19, 28, 37}, n = 1..8, m = 1, cm_stride(11); one iteration per combination.
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    for (uint32_t n_dim = 1; n_dim <= 8; ++n_dim) {
      for (uint32_t m_dim = 1; m_dim <= 1; ++m_dim) {
        GemmMicrokernelTester()
          .mr(1).nr(8).kr(4).sr(1)
          .m(m_dim).n(n_dim).k(k_dim)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 1x8 tile at k = 8 with qmin(128).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(4).sr(1)
    .m(1).n(8).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// Full 1x8 tile at k = 8 with qmax(128).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, qmax) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(4).sr(1)
    .m(1).n(8).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// Full 1x8 tile at k = 8 with cm_stride(11).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, strided_cm) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(1).nr(8).kr(4).sr(1)
    .m(1).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// Full 1x8 tile, k in {1, 10, 19, 28, 37}, a_zero_point(0).
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k_dim = 1; k_dim <= 40; k_dim += 9) {
    GemmMicrokernelTester()
      .mr(1).nr(8).kr(4).sr(1)
      .m(1).n(8).k(k_dim)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, no_b_zero_point) { 4032 TEST_REQUIRES_ARM_NEON_DOT; 4033 for (size_t k = 1; k <= 40; k += 9) { 4034 GemmMicrokernelTester() 4035 .mr(1) 4036 .nr(8) 4037 .kr(4) 4038 .sr(1) 4039 .m(1) 4040 .n(8) 4041 .k(k) 4042 .b_zero_point(0) 4043 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4044 } 4045 } 4046 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT,no_zero_point)4047 TEST(QU8_GEMM_MINMAX_RNDNU_1X8C4__NEONDOT, no_zero_point) { 4048 TEST_REQUIRES_ARM_NEON_DOT; 4049 for (size_t k = 1; k <= 40; k += 9) { 4050 GemmMicrokernelTester() 4051 .mr(1) 4052 .nr(8) 4053 .kr(4) 4054 .sr(1) 4055 .m(1) 4056 .n(8) 4057 .k(k) 4058 .a_zero_point(0) 4059 .b_zero_point(0) 4060 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4061 } 4062 } 4063 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 4064 4065 4066 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_eq_8)4067 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_eq_8) { 4068 TEST_REQUIRES_ARM_NEON_DOT; 4069 GemmMicrokernelTester() 4070 .mr(6) 4071 .nr(8) 4072 .kr(4) 4073 .sr(1) 4074 .m(6) 4075 .n(8) 4076 .k(8) 4077 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4078 } 4079 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,strided_cn)4080 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, strided_cn) { 4081 TEST_REQUIRES_ARM_NEON_DOT; 4082 GemmMicrokernelTester() 4083 .mr(6) 4084 .nr(8) 4085 .kr(4) 4086 .sr(1) 4087 .m(6) 4088 .n(8) 4089 .k(8) 4090 .cn_stride(11) 4091 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4092 } 4093 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_eq_8_strided_a)4094 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_eq_8_strided_a) { 
4095 TEST_REQUIRES_ARM_NEON_DOT; 4096 GemmMicrokernelTester() 4097 .mr(6) 4098 .nr(8) 4099 .kr(4) 4100 .sr(1) 4101 .m(6) 4102 .n(8) 4103 .k(8) 4104 .a_stride(11) 4105 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4106 } 4107 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_eq_8_subtile)4108 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_eq_8_subtile) { 4109 TEST_REQUIRES_ARM_NEON_DOT; 4110 for (uint32_t n = 1; n <= 8; n++) { 4111 for (uint32_t m = 1; m <= 6; m++) { 4112 GemmMicrokernelTester() 4113 .mr(6) 4114 .nr(8) 4115 .kr(4) 4116 .sr(1) 4117 .m(m) 4118 .n(n) 4119 .k(8) 4120 .iterations(1) 4121 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4122 } 4123 } 4124 } 4125 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_eq_8_subtile_m)4126 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_eq_8_subtile_m) { 4127 TEST_REQUIRES_ARM_NEON_DOT; 4128 for (uint32_t m = 1; m <= 6; m++) { 4129 GemmMicrokernelTester() 4130 .mr(6) 4131 .nr(8) 4132 .kr(4) 4133 .sr(1) 4134 .m(m) 4135 .n(8) 4136 .k(8) 4137 .iterations(1) 4138 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4139 } 4140 } 4141 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_eq_8_subtile_n)4142 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_eq_8_subtile_n) { 4143 TEST_REQUIRES_ARM_NEON_DOT; 4144 for (uint32_t n = 1; n <= 8; n++) { 4145 GemmMicrokernelTester() 4146 .mr(6) 4147 .nr(8) 4148 .kr(4) 4149 .sr(1) 4150 .m(6) 4151 .n(n) 4152 .k(8) 4153 .iterations(1) 4154 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4155 } 4156 } 4157 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_lt_8)4158 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_lt_8) { 4159 TEST_REQUIRES_ARM_NEON_DOT; 4160 for (size_t k = 1; k < 8; k++) { 4161 
GemmMicrokernelTester() 4162 .mr(6) 4163 .nr(8) 4164 .kr(4) 4165 .sr(1) 4166 .m(6) 4167 .n(8) 4168 .k(k) 4169 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4170 } 4171 } 4172 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_lt_8_strided_a)4173 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_lt_8_strided_a) { 4174 TEST_REQUIRES_ARM_NEON_DOT; 4175 for (size_t k = 1; k < 8; k++) { 4176 GemmMicrokernelTester() 4177 .mr(6) 4178 .nr(8) 4179 .kr(4) 4180 .sr(1) 4181 .m(6) 4182 .n(8) 4183 .k(k) 4184 .a_stride(11) 4185 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4186 } 4187 } 4188 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_lt_8_subtile)4189 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_lt_8_subtile) { 4190 TEST_REQUIRES_ARM_NEON_DOT; 4191 for (size_t k = 1; k < 8; k++) { 4192 for (uint32_t n = 1; n <= 8; n++) { 4193 for (uint32_t m = 1; m <= 6; m++) { 4194 GemmMicrokernelTester() 4195 .mr(6) 4196 .nr(8) 4197 .kr(4) 4198 .sr(1) 4199 .m(m) 4200 .n(n) 4201 .k(k) 4202 .iterations(1) 4203 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4204 } 4205 } 4206 } 4207 } 4208 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_gt_8)4209 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_gt_8) { 4210 TEST_REQUIRES_ARM_NEON_DOT; 4211 for (size_t k = 9; k < 16; k++) { 4212 GemmMicrokernelTester() 4213 .mr(6) 4214 .nr(8) 4215 .kr(4) 4216 .sr(1) 4217 .m(6) 4218 .n(8) 4219 .k(k) 4220 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4221 } 4222 } 4223 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_gt_8_strided_a)4224 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_gt_8_strided_a) { 4225 TEST_REQUIRES_ARM_NEON_DOT; 4226 for (size_t k = 9; k < 16; k++) { 4227 GemmMicrokernelTester() 4228 .mr(6) 4229 .nr(8) 
4230 .kr(4) 4231 .sr(1) 4232 .m(6) 4233 .n(8) 4234 .k(k) 4235 .a_stride(19) 4236 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4237 } 4238 } 4239 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_gt_8_subtile)4240 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_gt_8_subtile) { 4241 TEST_REQUIRES_ARM_NEON_DOT; 4242 for (size_t k = 9; k < 16; k++) { 4243 for (uint32_t n = 1; n <= 8; n++) { 4244 for (uint32_t m = 1; m <= 6; m++) { 4245 GemmMicrokernelTester() 4246 .mr(6) 4247 .nr(8) 4248 .kr(4) 4249 .sr(1) 4250 .m(m) 4251 .n(n) 4252 .k(k) 4253 .iterations(1) 4254 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4255 } 4256 } 4257 } 4258 } 4259 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_div_8)4260 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_div_8) { 4261 TEST_REQUIRES_ARM_NEON_DOT; 4262 for (size_t k = 16; k <= 80; k += 8) { 4263 GemmMicrokernelTester() 4264 .mr(6) 4265 .nr(8) 4266 .kr(4) 4267 .sr(1) 4268 .m(6) 4269 .n(8) 4270 .k(k) 4271 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4272 } 4273 } 4274 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_div_8_strided_a)4275 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_div_8_strided_a) { 4276 TEST_REQUIRES_ARM_NEON_DOT; 4277 for (size_t k = 16; k <= 80; k += 8) { 4278 GemmMicrokernelTester() 4279 .mr(6) 4280 .nr(8) 4281 .kr(4) 4282 .sr(1) 4283 .m(6) 4284 .n(8) 4285 .k(k) 4286 .a_stride(83) 4287 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4288 } 4289 } 4290 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,k_div_8_subtile)4291 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, k_div_8_subtile) { 4292 TEST_REQUIRES_ARM_NEON_DOT; 4293 for (size_t k = 16; k <= 80; k += 8) { 4294 for (uint32_t n = 1; n <= 8; n++) { 4295 for (uint32_t m = 
1; m <= 6; m++) { 4296 GemmMicrokernelTester() 4297 .mr(6) 4298 .nr(8) 4299 .kr(4) 4300 .sr(1) 4301 .m(m) 4302 .n(n) 4303 .k(k) 4304 .iterations(1) 4305 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4306 } 4307 } 4308 } 4309 } 4310 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_gt_8)4311 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_gt_8) { 4312 TEST_REQUIRES_ARM_NEON_DOT; 4313 for (uint32_t n = 9; n < 16; n++) { 4314 for (size_t k = 1; k <= 40; k += 9) { 4315 GemmMicrokernelTester() 4316 .mr(6) 4317 .nr(8) 4318 .kr(4) 4319 .sr(1) 4320 .m(6) 4321 .n(n) 4322 .k(k) 4323 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4324 } 4325 } 4326 } 4327 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_gt_8_strided_cn)4328 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_gt_8_strided_cn) { 4329 TEST_REQUIRES_ARM_NEON_DOT; 4330 for (uint32_t n = 9; n < 16; n++) { 4331 for (size_t k = 1; k <= 40; k += 9) { 4332 GemmMicrokernelTester() 4333 .mr(6) 4334 .nr(8) 4335 .kr(4) 4336 .sr(1) 4337 .m(6) 4338 .n(n) 4339 .k(k) 4340 .cn_stride(11) 4341 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4342 } 4343 } 4344 } 4345 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_gt_8_strided_a)4346 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_gt_8_strided_a) { 4347 TEST_REQUIRES_ARM_NEON_DOT; 4348 for (uint32_t n = 9; n < 16; n++) { 4349 for (size_t k = 1; k <= 40; k += 9) { 4350 GemmMicrokernelTester() 4351 .mr(6) 4352 .nr(8) 4353 .kr(4) 4354 .sr(1) 4355 .m(6) 4356 .n(n) 4357 .k(k) 4358 .a_stride(43) 4359 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4360 } 4361 } 4362 } 4363 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_gt_8_subtile)4364 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_gt_8_subtile) { 4365 
TEST_REQUIRES_ARM_NEON_DOT; 4366 for (uint32_t n = 9; n < 16; n++) { 4367 for (size_t k = 1; k <= 40; k += 9) { 4368 for (uint32_t m = 1; m <= 6; m++) { 4369 GemmMicrokernelTester() 4370 .mr(6) 4371 .nr(8) 4372 .kr(4) 4373 .sr(1) 4374 .m(m) 4375 .n(n) 4376 .k(k) 4377 .iterations(1) 4378 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4379 } 4380 } 4381 } 4382 } 4383 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_div_8)4384 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_div_8) { 4385 TEST_REQUIRES_ARM_NEON_DOT; 4386 for (uint32_t n = 16; n <= 24; n += 8) { 4387 for (size_t k = 1; k <= 40; k += 9) { 4388 GemmMicrokernelTester() 4389 .mr(6) 4390 .nr(8) 4391 .kr(4) 4392 .sr(1) 4393 .m(6) 4394 .n(n) 4395 .k(k) 4396 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4397 } 4398 } 4399 } 4400 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_div_8_strided_cn)4401 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_div_8_strided_cn) { 4402 TEST_REQUIRES_ARM_NEON_DOT; 4403 for (uint32_t n = 16; n <= 24; n += 8) { 4404 for (size_t k = 1; k <= 40; k += 9) { 4405 GemmMicrokernelTester() 4406 .mr(6) 4407 .nr(8) 4408 .kr(4) 4409 .sr(1) 4410 .m(6) 4411 .n(n) 4412 .k(k) 4413 .cn_stride(11) 4414 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4415 } 4416 } 4417 } 4418 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_div_8_strided_a)4419 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_div_8_strided_a) { 4420 TEST_REQUIRES_ARM_NEON_DOT; 4421 for (uint32_t n = 16; n <= 24; n += 8) { 4422 for (size_t k = 1; k <= 40; k += 9) { 4423 GemmMicrokernelTester() 4424 .mr(6) 4425 .nr(8) 4426 .kr(4) 4427 .sr(1) 4428 .m(6) 4429 .n(n) 4430 .k(k) 4431 .a_stride(43) 4432 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 
4433 } 4434 } 4435 } 4436 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,n_div_8_subtile)4437 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, n_div_8_subtile) { 4438 TEST_REQUIRES_ARM_NEON_DOT; 4439 for (uint32_t n = 16; n <= 24; n += 8) { 4440 for (size_t k = 1; k <= 40; k += 9) { 4441 for (uint32_t m = 1; m <= 6; m++) { 4442 GemmMicrokernelTester() 4443 .mr(6) 4444 .nr(8) 4445 .kr(4) 4446 .sr(1) 4447 .m(m) 4448 .n(n) 4449 .k(k) 4450 .iterations(1) 4451 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4452 } 4453 } 4454 } 4455 } 4456 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,strided_cm_subtile)4457 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, strided_cm_subtile) { 4458 TEST_REQUIRES_ARM_NEON_DOT; 4459 for (size_t k = 1; k <= 40; k += 9) { 4460 for (uint32_t n = 1; n <= 8; n++) { 4461 for (uint32_t m = 1; m <= 6; m++) { 4462 GemmMicrokernelTester() 4463 .mr(6) 4464 .nr(8) 4465 .kr(4) 4466 .sr(1) 4467 .m(m) 4468 .n(n) 4469 .k(k) 4470 .cm_stride(11) 4471 .iterations(1) 4472 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4473 } 4474 } 4475 } 4476 } 4477 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,qmin)4478 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, qmin) { 4479 TEST_REQUIRES_ARM_NEON_DOT; 4480 GemmMicrokernelTester() 4481 .mr(6) 4482 .nr(8) 4483 .kr(4) 4484 .sr(1) 4485 .m(6) 4486 .n(8) 4487 .k(8) 4488 .qmin(128) 4489 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4490 } 4491 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,qmax)4492 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, qmax) { 4493 TEST_REQUIRES_ARM_NEON_DOT; 4494 GemmMicrokernelTester() 4495 .mr(6) 4496 .nr(8) 4497 .kr(4) 4498 .sr(1) 4499 .m(6) 4500 .n(8) 4501 .k(8) 4502 .qmax(128) 4503 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 4504 } 4505 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,strided_cm)4506 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, strided_cm) { 4507 TEST_REQUIRES_ARM_NEON_DOT; 4508 GemmMicrokernelTester() 4509 .mr(6) 4510 .nr(8) 4511 .kr(4) 4512 .sr(1) 4513 .m(6) 4514 .n(8) 4515 .k(8) 4516 .cm_stride(11) 4517 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4518 } 4519 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,no_a_zero_point)4520 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, no_a_zero_point) { 4521 TEST_REQUIRES_ARM_NEON_DOT; 4522 for (size_t k = 1; k <= 40; k += 9) { 4523 GemmMicrokernelTester() 4524 .mr(6) 4525 .nr(8) 4526 .kr(4) 4527 .sr(1) 4528 .m(6) 4529 .n(8) 4530 .k(k) 4531 .a_zero_point(0) 4532 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4533 } 4534 } 4535 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,no_b_zero_point)4536 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, no_b_zero_point) { 4537 TEST_REQUIRES_ARM_NEON_DOT; 4538 for (size_t k = 1; k <= 40; k += 9) { 4539 GemmMicrokernelTester() 4540 .mr(6) 4541 .nr(8) 4542 .kr(4) 4543 .sr(1) 4544 .m(6) 4545 .n(8) 4546 .k(k) 4547 .b_zero_point(0) 4548 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4549 } 4550 } 4551 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT,no_zero_point)4552 TEST(QU8_GEMM_MINMAX_RNDNU_6X8C4__NEONDOT, no_zero_point) { 4553 TEST_REQUIRES_ARM_NEON_DOT; 4554 for (size_t k = 1; k <= 40; k += 9) { 4555 GemmMicrokernelTester() 4556 .mr(6) 4557 .nr(8) 4558 .kr(4) 4559 .sr(1) 4560 .m(6) 4561 .n(8) 4562 .k(k) 4563 .a_zero_point(0) 4564 .b_zero_point(0) 4565 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4566 } 4567 } 4568 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 4569 
4570 4571 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_eq_8)4572 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_eq_8) { 4573 TEST_REQUIRES_ARM_NEON_DOT; 4574 GemmMicrokernelTester() 4575 .mr(8) 4576 .nr(8) 4577 .kr(4) 4578 .sr(1) 4579 .m(8) 4580 .n(8) 4581 .k(8) 4582 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4583 } 4584 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,strided_cn)4585 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, strided_cn) { 4586 TEST_REQUIRES_ARM_NEON_DOT; 4587 GemmMicrokernelTester() 4588 .mr(8) 4589 .nr(8) 4590 .kr(4) 4591 .sr(1) 4592 .m(8) 4593 .n(8) 4594 .k(8) 4595 .cn_stride(11) 4596 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4597 } 4598 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_eq_8_strided_a)4599 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_eq_8_strided_a) { 4600 TEST_REQUIRES_ARM_NEON_DOT; 4601 GemmMicrokernelTester() 4602 .mr(8) 4603 .nr(8) 4604 .kr(4) 4605 .sr(1) 4606 .m(8) 4607 .n(8) 4608 .k(8) 4609 .a_stride(11) 4610 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4611 } 4612 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_eq_8_subtile)4613 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_eq_8_subtile) { 4614 TEST_REQUIRES_ARM_NEON_DOT; 4615 for (uint32_t n = 1; n <= 8; n++) { 4616 for (uint32_t m = 1; m <= 8; m++) { 4617 GemmMicrokernelTester() 4618 .mr(8) 4619 .nr(8) 4620 .kr(4) 4621 .sr(1) 4622 .m(m) 4623 .n(n) 4624 .k(8) 4625 .iterations(1) 4626 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4627 } 4628 } 4629 } 4630 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_eq_8_subtile_m)4631 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_eq_8_subtile_m) { 4632 
TEST_REQUIRES_ARM_NEON_DOT; 4633 for (uint32_t m = 1; m <= 8; m++) { 4634 GemmMicrokernelTester() 4635 .mr(8) 4636 .nr(8) 4637 .kr(4) 4638 .sr(1) 4639 .m(m) 4640 .n(8) 4641 .k(8) 4642 .iterations(1) 4643 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4644 } 4645 } 4646 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_eq_8_subtile_n)4647 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_eq_8_subtile_n) { 4648 TEST_REQUIRES_ARM_NEON_DOT; 4649 for (uint32_t n = 1; n <= 8; n++) { 4650 GemmMicrokernelTester() 4651 .mr(8) 4652 .nr(8) 4653 .kr(4) 4654 .sr(1) 4655 .m(8) 4656 .n(n) 4657 .k(8) 4658 .iterations(1) 4659 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4660 } 4661 } 4662 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_lt_8)4663 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_lt_8) { 4664 TEST_REQUIRES_ARM_NEON_DOT; 4665 for (size_t k = 1; k < 8; k++) { 4666 GemmMicrokernelTester() 4667 .mr(8) 4668 .nr(8) 4669 .kr(4) 4670 .sr(1) 4671 .m(8) 4672 .n(8) 4673 .k(k) 4674 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4675 } 4676 } 4677 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_lt_8_strided_a)4678 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_lt_8_strided_a) { 4679 TEST_REQUIRES_ARM_NEON_DOT; 4680 for (size_t k = 1; k < 8; k++) { 4681 GemmMicrokernelTester() 4682 .mr(8) 4683 .nr(8) 4684 .kr(4) 4685 .sr(1) 4686 .m(8) 4687 .n(8) 4688 .k(k) 4689 .a_stride(11) 4690 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4691 } 4692 } 4693 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_lt_8_subtile)4694 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_lt_8_subtile) { 4695 TEST_REQUIRES_ARM_NEON_DOT; 4696 for (size_t k = 1; k < 8; k++) { 4697 for (uint32_t n = 1; n <= 8; n++) { 4698 for 
(uint32_t m = 1; m <= 8; m++) { 4699 GemmMicrokernelTester() 4700 .mr(8) 4701 .nr(8) 4702 .kr(4) 4703 .sr(1) 4704 .m(m) 4705 .n(n) 4706 .k(k) 4707 .iterations(1) 4708 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4709 } 4710 } 4711 } 4712 } 4713 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_gt_8)4714 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_gt_8) { 4715 TEST_REQUIRES_ARM_NEON_DOT; 4716 for (size_t k = 9; k < 16; k++) { 4717 GemmMicrokernelTester() 4718 .mr(8) 4719 .nr(8) 4720 .kr(4) 4721 .sr(1) 4722 .m(8) 4723 .n(8) 4724 .k(k) 4725 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4726 } 4727 } 4728 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_gt_8_strided_a)4729 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_gt_8_strided_a) { 4730 TEST_REQUIRES_ARM_NEON_DOT; 4731 for (size_t k = 9; k < 16; k++) { 4732 GemmMicrokernelTester() 4733 .mr(8) 4734 .nr(8) 4735 .kr(4) 4736 .sr(1) 4737 .m(8) 4738 .n(8) 4739 .k(k) 4740 .a_stride(19) 4741 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4742 } 4743 } 4744 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_gt_8_subtile)4745 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_gt_8_subtile) { 4746 TEST_REQUIRES_ARM_NEON_DOT; 4747 for (size_t k = 9; k < 16; k++) { 4748 for (uint32_t n = 1; n <= 8; n++) { 4749 for (uint32_t m = 1; m <= 8; m++) { 4750 GemmMicrokernelTester() 4751 .mr(8) 4752 .nr(8) 4753 .kr(4) 4754 .sr(1) 4755 .m(m) 4756 .n(n) 4757 .k(k) 4758 .iterations(1) 4759 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4760 } 4761 } 4762 } 4763 } 4764 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_div_8)4765 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_div_8) { 4766 TEST_REQUIRES_ARM_NEON_DOT; 4767 for (size_t k = 16; k <= 80; k += 
8) { 4768 GemmMicrokernelTester() 4769 .mr(8) 4770 .nr(8) 4771 .kr(4) 4772 .sr(1) 4773 .m(8) 4774 .n(8) 4775 .k(k) 4776 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4777 } 4778 } 4779 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_div_8_strided_a)4780 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_div_8_strided_a) { 4781 TEST_REQUIRES_ARM_NEON_DOT; 4782 for (size_t k = 16; k <= 80; k += 8) { 4783 GemmMicrokernelTester() 4784 .mr(8) 4785 .nr(8) 4786 .kr(4) 4787 .sr(1) 4788 .m(8) 4789 .n(8) 4790 .k(k) 4791 .a_stride(83) 4792 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4793 } 4794 } 4795 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,k_div_8_subtile)4796 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, k_div_8_subtile) { 4797 TEST_REQUIRES_ARM_NEON_DOT; 4798 for (size_t k = 16; k <= 80; k += 8) { 4799 for (uint32_t n = 1; n <= 8; n++) { 4800 for (uint32_t m = 1; m <= 8; m++) { 4801 GemmMicrokernelTester() 4802 .mr(8) 4803 .nr(8) 4804 .kr(4) 4805 .sr(1) 4806 .m(m) 4807 .n(n) 4808 .k(k) 4809 .iterations(1) 4810 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4811 } 4812 } 4813 } 4814 } 4815 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,n_gt_8)4816 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_gt_8) { 4817 TEST_REQUIRES_ARM_NEON_DOT; 4818 for (uint32_t n = 9; n < 16; n++) { 4819 for (size_t k = 1; k <= 40; k += 9) { 4820 GemmMicrokernelTester() 4821 .mr(8) 4822 .nr(8) 4823 .kr(4) 4824 .sr(1) 4825 .m(8) 4826 .n(n) 4827 .k(k) 4828 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4829 } 4830 } 4831 } 4832 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,n_gt_8_strided_cn)4833 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_gt_8_strided_cn) { 4834 TEST_REQUIRES_ARM_NEON_DOT; 4835 for 
(uint32_t n = 9; n < 16; n++) { 4836 for (size_t k = 1; k <= 40; k += 9) { 4837 GemmMicrokernelTester() 4838 .mr(8) 4839 .nr(8) 4840 .kr(4) 4841 .sr(1) 4842 .m(8) 4843 .n(n) 4844 .k(k) 4845 .cn_stride(11) 4846 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4847 } 4848 } 4849 } 4850 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,n_gt_8_strided_a)4851 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_gt_8_strided_a) { 4852 TEST_REQUIRES_ARM_NEON_DOT; 4853 for (uint32_t n = 9; n < 16; n++) { 4854 for (size_t k = 1; k <= 40; k += 9) { 4855 GemmMicrokernelTester() 4856 .mr(8) 4857 .nr(8) 4858 .kr(4) 4859 .sr(1) 4860 .m(8) 4861 .n(n) 4862 .k(k) 4863 .a_stride(43) 4864 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4865 } 4866 } 4867 } 4868 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,n_gt_8_subtile)4869 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_gt_8_subtile) { 4870 TEST_REQUIRES_ARM_NEON_DOT; 4871 for (uint32_t n = 9; n < 16; n++) { 4872 for (size_t k = 1; k <= 40; k += 9) { 4873 for (uint32_t m = 1; m <= 8; m++) { 4874 GemmMicrokernelTester() 4875 .mr(8) 4876 .nr(8) 4877 .kr(4) 4878 .sr(1) 4879 .m(m) 4880 .n(n) 4881 .k(k) 4882 .iterations(1) 4883 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4884 } 4885 } 4886 } 4887 } 4888 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,n_div_8)4889 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_div_8) { 4890 TEST_REQUIRES_ARM_NEON_DOT; 4891 for (uint32_t n = 16; n <= 24; n += 8) { 4892 for (size_t k = 1; k <= 40; k += 9) { 4893 GemmMicrokernelTester() 4894 .mr(8) 4895 .nr(8) 4896 .kr(4) 4897 .sr(1) 4898 .m(8) 4899 .n(n) 4900 .k(k) 4901 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 4902 } 4903 } 4904 } 4905 
// n = 16 and 24 with k = 1, 10, ..., 37, m fixed at 8 and cn_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_div_8_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(4).sr(1)
        .m(8).n(n).k(k)
        .cn_stride(11)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 16 and 24 with k = 1, 10, ..., 37, m fixed at 8 and a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8).nr(8).kr(4).sr(1)
        .m(8).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 16 and 24, k = 1, 10, ..., 37 and m in [1, 8], one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, n_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 16; n <= 24; n += 8) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Every (m, n) sub-tile up to 8x8 with k = 1, 10, ..., 37 and cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 8; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8).nr(8).kr(4).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(11)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 8x8 tile at k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(8).nr(8).kr(4).sr(1)
    .m(8).n(8).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 8x8 tile at k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, qmax) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(8).nr(8).kr(4).sr(1)
    .m(8).n(8).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 8x8 tile at k = 8 with cm_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, strided_cm) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(8).nr(8).kr(4).sr(1)
    .m(8).n(8).k(8)
    .cm_stride(11)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// k = 1, 10, ..., 37 on the full tile with a_zero_point forced to 0.
TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, no_a_zero_point) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 40; k += 9) {
    GemmMicrokernelTester()
      .mr(8).nr(8).kr(4).sr(1)
      .m(8).n(8).k(k)
      .a_zero_point(0)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,no_b_zero_point)5041 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, no_b_zero_point) { 5042 TEST_REQUIRES_ARM_NEON_DOT; 5043 for (size_t k = 1; k <= 40; k += 9) { 5044 GemmMicrokernelTester() 5045 .mr(8) 5046 .nr(8) 5047 .kr(4) 5048 .sr(1) 5049 .m(8) 5050 .n(8) 5051 .k(k) 5052 .b_zero_point(0) 5053 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5054 } 5055 } 5056 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT,no_zero_point)5057 TEST(QU8_GEMM_MINMAX_RNDNU_8X8C4__NEONDOT, no_zero_point) { 5058 TEST_REQUIRES_ARM_NEON_DOT; 5059 for (size_t k = 1; k <= 40; k += 9) { 5060 GemmMicrokernelTester() 5061 .mr(8) 5062 .nr(8) 5063 .kr(4) 5064 .sr(1) 5065 .m(8) 5066 .n(8) 5067 .k(k) 5068 .a_zero_point(0) 5069 .b_zero_point(0) 5070 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5071 } 5072 } 5073 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 5074 5075 5076 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,k_eq_8)5077 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_eq_8) { 5078 TEST_REQUIRES_ARM_NEON_DOT; 5079 GemmMicrokernelTester() 5080 .mr(2) 5081 .nr(16) 5082 .kr(4) 5083 .sr(1) 5084 .m(2) 5085 .n(16) 5086 .k(8) 5087 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5088 } 5089 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,strided_cn)5090 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, strided_cn) { 5091 TEST_REQUIRES_ARM_NEON_DOT; 5092 GemmMicrokernelTester() 5093 .mr(2) 5094 .nr(16) 5095 .kr(4) 5096 .sr(1) 5097 .m(2) 5098 .n(16) 5099 .k(8) 5100 .cn_stride(19) 5101 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5102 } 5103 
// Full 2x16 tile at k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(2).nr(16).kr(4).sr(1)
    .m(2).n(16).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Every (m, n) sub-tile up to 2x16 at k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 1; n <= 16; n++) {
    for (uint32_t m = 1; m <= 2; m++) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(m).n(n).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m in [1, 2] with n = 16 and k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t m = 1; m <= 2; m++) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(m).n(16).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// n in [1, 16] with m = 2 and k = 8, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(n).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in [1, 8) on the full 2x16 tile.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Same k sweep as k_lt_8, with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in [1, 8) crossed with every (m, n) sub-tile up to 2x16, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k in [9, 16) on the full 2x16 tile.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in [9, 16) on the full 2x16 tile with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k in [9, 16) crossed with every (m, n) sub-tile up to 2x16, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k = 16, 24, ..., 80 on the full 2x16 tile.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Same k sweep as k_div_8, with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(2).nr(16).kr(4).sr(1)
      .m(2).n(16).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k = 16, 24, ..., 80 crossed with every (m, n) sub-tile up to 2x16, one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n in [17, 32) with k = 1, 10, ..., 37 and m fixed at 2.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_gt_16, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_gt_16, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in [17, 32), k = 1, 10, ..., 37 and m in [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// n = 32 and 48 with k = 1, 10, ..., 37 and m fixed at 2.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_div_16, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same sweep as n_div_16, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(2).nr(16).kr(4).sr(1)
        .m(2).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 32 and 48, k = 1, 10, ..., 37 and m in [1, 2], one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Every (m, n) sub-tile up to 2x16 with k = 1, 10, ..., 37 and cm_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 2; m++) {
        GemmMicrokernelTester()
          .mr(2).nr(16).kr(4).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 2x16 tile at k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, qmin) {
  TEST_REQUIRES_ARM_NEON_DOT;
  GemmMicrokernelTester()
    .mr(2).nr(16).kr(4).sr(1)
    .m(2).n(16).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,qmax)5502 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, qmax) { 5503 TEST_REQUIRES_ARM_NEON_DOT; 5504 GemmMicrokernelTester() 5505 .mr(2) 5506 .nr(16) 5507 .kr(4) 5508 .sr(1) 5509 .m(2) 5510 .n(16) 5511 .k(8) 5512 .qmax(128) 5513 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5514 } 5515 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,strided_cm)5516 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, strided_cm) { 5517 TEST_REQUIRES_ARM_NEON_DOT; 5518 GemmMicrokernelTester() 5519 .mr(2) 5520 .nr(16) 5521 .kr(4) 5522 .sr(1) 5523 .m(2) 5524 .n(16) 5525 .k(8) 5526 .cm_stride(19) 5527 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5528 } 5529 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,no_a_zero_point)5530 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, no_a_zero_point) { 5531 TEST_REQUIRES_ARM_NEON_DOT; 5532 for (size_t k = 1; k <= 40; k += 9) { 5533 GemmMicrokernelTester() 5534 .mr(2) 5535 .nr(16) 5536 .kr(4) 5537 .sr(1) 5538 .m(2) 5539 .n(16) 5540 .k(k) 5541 .a_zero_point(0) 5542 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5543 } 5544 } 5545 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,no_b_zero_point)5546 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, no_b_zero_point) { 5547 TEST_REQUIRES_ARM_NEON_DOT; 5548 for (size_t k = 1; k <= 40; k += 9) { 5549 GemmMicrokernelTester() 5550 .mr(2) 5551 .nr(16) 5552 .kr(4) 5553 .sr(1) 5554 .m(2) 5555 .n(16) 5556 .k(k) 5557 .b_zero_point(0) 5558 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5559 } 5560 } 5561 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT,no_zero_point)5562 TEST(QU8_GEMM_MINMAX_RNDNU_2X16C4__NEONDOT, no_zero_point) { 5563 TEST_REQUIRES_ARM_NEON_DOT; 5564 for (size_t k = 1; k 
<= 40; k += 9) { 5565 GemmMicrokernelTester() 5566 .mr(2) 5567 .nr(16) 5568 .kr(4) 5569 .sr(1) 5570 .m(2) 5571 .n(16) 5572 .k(k) 5573 .a_zero_point(0) 5574 .b_zero_point(0) 5575 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5576 } 5577 } 5578 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 5579 5580 5581 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_eq_8)5582 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_eq_8) { 5583 TEST_REQUIRES_ARM_NEON_DOT; 5584 GemmMicrokernelTester() 5585 .mr(3) 5586 .nr(16) 5587 .kr(4) 5588 .sr(1) 5589 .m(3) 5590 .n(16) 5591 .k(8) 5592 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5593 } 5594 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,strided_cn)5595 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, strided_cn) { 5596 TEST_REQUIRES_ARM_NEON_DOT; 5597 GemmMicrokernelTester() 5598 .mr(3) 5599 .nr(16) 5600 .kr(4) 5601 .sr(1) 5602 .m(3) 5603 .n(16) 5604 .k(8) 5605 .cn_stride(19) 5606 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5607 } 5608 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_eq_8_strided_a)5609 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_eq_8_strided_a) { 5610 TEST_REQUIRES_ARM_NEON_DOT; 5611 GemmMicrokernelTester() 5612 .mr(3) 5613 .nr(16) 5614 .kr(4) 5615 .sr(1) 5616 .m(3) 5617 .n(16) 5618 .k(8) 5619 .a_stride(11) 5620 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5621 } 5622 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_eq_8_subtile)5623 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_eq_8_subtile) { 5624 TEST_REQUIRES_ARM_NEON_DOT; 5625 for (uint32_t n = 1; n <= 16; n++) { 5626 for (uint32_t m = 1; m <= 3; m++) { 5627 GemmMicrokernelTester() 5628 
.mr(3) 5629 .nr(16) 5630 .kr(4) 5631 .sr(1) 5632 .m(m) 5633 .n(n) 5634 .k(8) 5635 .iterations(1) 5636 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5637 } 5638 } 5639 } 5640 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_eq_8_subtile_m)5641 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_eq_8_subtile_m) { 5642 TEST_REQUIRES_ARM_NEON_DOT; 5643 for (uint32_t m = 1; m <= 3; m++) { 5644 GemmMicrokernelTester() 5645 .mr(3) 5646 .nr(16) 5647 .kr(4) 5648 .sr(1) 5649 .m(m) 5650 .n(16) 5651 .k(8) 5652 .iterations(1) 5653 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5654 } 5655 } 5656 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_eq_8_subtile_n)5657 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_eq_8_subtile_n) { 5658 TEST_REQUIRES_ARM_NEON_DOT; 5659 for (uint32_t n = 1; n <= 16; n++) { 5660 GemmMicrokernelTester() 5661 .mr(3) 5662 .nr(16) 5663 .kr(4) 5664 .sr(1) 5665 .m(3) 5666 .n(n) 5667 .k(8) 5668 .iterations(1) 5669 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5670 } 5671 } 5672 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_lt_8)5673 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_lt_8) { 5674 TEST_REQUIRES_ARM_NEON_DOT; 5675 for (size_t k = 1; k < 8; k++) { 5676 GemmMicrokernelTester() 5677 .mr(3) 5678 .nr(16) 5679 .kr(4) 5680 .sr(1) 5681 .m(3) 5682 .n(16) 5683 .k(k) 5684 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5685 } 5686 } 5687 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_lt_8_strided_a)5688 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_lt_8_strided_a) { 5689 TEST_REQUIRES_ARM_NEON_DOT; 5690 for (size_t k = 1; k < 8; k++) { 5691 GemmMicrokernelTester() 5692 .mr(3) 5693 .nr(16) 5694 .kr(4) 5695 .sr(1) 5696 .m(3) 5697 .n(16) 5698 .k(k) 5699 
.a_stride(11) 5700 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5701 } 5702 } 5703 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_lt_8_subtile)5704 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_lt_8_subtile) { 5705 TEST_REQUIRES_ARM_NEON_DOT; 5706 for (size_t k = 1; k < 8; k++) { 5707 for (uint32_t n = 1; n <= 16; n++) { 5708 for (uint32_t m = 1; m <= 3; m++) { 5709 GemmMicrokernelTester() 5710 .mr(3) 5711 .nr(16) 5712 .kr(4) 5713 .sr(1) 5714 .m(m) 5715 .n(n) 5716 .k(k) 5717 .iterations(1) 5718 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5719 } 5720 } 5721 } 5722 } 5723 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_gt_8)5724 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_gt_8) { 5725 TEST_REQUIRES_ARM_NEON_DOT; 5726 for (size_t k = 9; k < 16; k++) { 5727 GemmMicrokernelTester() 5728 .mr(3) 5729 .nr(16) 5730 .kr(4) 5731 .sr(1) 5732 .m(3) 5733 .n(16) 5734 .k(k) 5735 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5736 } 5737 } 5738 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_gt_8_strided_a)5739 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_gt_8_strided_a) { 5740 TEST_REQUIRES_ARM_NEON_DOT; 5741 for (size_t k = 9; k < 16; k++) { 5742 GemmMicrokernelTester() 5743 .mr(3) 5744 .nr(16) 5745 .kr(4) 5746 .sr(1) 5747 .m(3) 5748 .n(16) 5749 .k(k) 5750 .a_stride(19) 5751 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5752 } 5753 } 5754 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_gt_8_subtile)5755 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_gt_8_subtile) { 5756 TEST_REQUIRES_ARM_NEON_DOT; 5757 for (size_t k = 9; k < 16; k++) { 5758 for (uint32_t n = 1; n <= 16; n++) { 5759 for (uint32_t m = 1; m <= 3; m++) { 5760 GemmMicrokernelTester() 5761 .mr(3) 5762 
.nr(16) 5763 .kr(4) 5764 .sr(1) 5765 .m(m) 5766 .n(n) 5767 .k(k) 5768 .iterations(1) 5769 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5770 } 5771 } 5772 } 5773 } 5774 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_div_8)5775 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_div_8) { 5776 TEST_REQUIRES_ARM_NEON_DOT; 5777 for (size_t k = 16; k <= 80; k += 8) { 5778 GemmMicrokernelTester() 5779 .mr(3) 5780 .nr(16) 5781 .kr(4) 5782 .sr(1) 5783 .m(3) 5784 .n(16) 5785 .k(k) 5786 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5787 } 5788 } 5789 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_div_8_strided_a)5790 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_div_8_strided_a) { 5791 TEST_REQUIRES_ARM_NEON_DOT; 5792 for (size_t k = 16; k <= 80; k += 8) { 5793 GemmMicrokernelTester() 5794 .mr(3) 5795 .nr(16) 5796 .kr(4) 5797 .sr(1) 5798 .m(3) 5799 .n(16) 5800 .k(k) 5801 .a_stride(83) 5802 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5803 } 5804 } 5805 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,k_div_8_subtile)5806 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, k_div_8_subtile) { 5807 TEST_REQUIRES_ARM_NEON_DOT; 5808 for (size_t k = 16; k <= 80; k += 8) { 5809 for (uint32_t n = 1; n <= 16; n++) { 5810 for (uint32_t m = 1; m <= 3; m++) { 5811 GemmMicrokernelTester() 5812 .mr(3) 5813 .nr(16) 5814 .kr(4) 5815 .sr(1) 5816 .m(m) 5817 .n(n) 5818 .k(k) 5819 .iterations(1) 5820 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5821 } 5822 } 5823 } 5824 } 5825 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_gt_16)5826 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_gt_16) { 5827 TEST_REQUIRES_ARM_NEON_DOT; 5828 for (uint32_t n = 17; n < 32; n++) { 5829 for (size_t k = 1; k <= 40; k 
+= 9) { 5830 GemmMicrokernelTester() 5831 .mr(3) 5832 .nr(16) 5833 .kr(4) 5834 .sr(1) 5835 .m(3) 5836 .n(n) 5837 .k(k) 5838 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5839 } 5840 } 5841 } 5842 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_gt_16_strided_cn)5843 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_gt_16_strided_cn) { 5844 TEST_REQUIRES_ARM_NEON_DOT; 5845 for (uint32_t n = 17; n < 32; n++) { 5846 for (size_t k = 1; k <= 40; k += 9) { 5847 GemmMicrokernelTester() 5848 .mr(3) 5849 .nr(16) 5850 .kr(4) 5851 .sr(1) 5852 .m(3) 5853 .n(n) 5854 .k(k) 5855 .cn_stride(19) 5856 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5857 } 5858 } 5859 } 5860 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_gt_16_strided_a)5861 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_gt_16_strided_a) { 5862 TEST_REQUIRES_ARM_NEON_DOT; 5863 for (uint32_t n = 17; n < 32; n++) { 5864 for (size_t k = 1; k <= 40; k += 9) { 5865 GemmMicrokernelTester() 5866 .mr(3) 5867 .nr(16) 5868 .kr(4) 5869 .sr(1) 5870 .m(3) 5871 .n(n) 5872 .k(k) 5873 .a_stride(43) 5874 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5875 } 5876 } 5877 } 5878 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_gt_16_subtile)5879 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_gt_16_subtile) { 5880 TEST_REQUIRES_ARM_NEON_DOT; 5881 for (uint32_t n = 17; n < 32; n++) { 5882 for (size_t k = 1; k <= 40; k += 9) { 5883 for (uint32_t m = 1; m <= 3; m++) { 5884 GemmMicrokernelTester() 5885 .mr(3) 5886 .nr(16) 5887 .kr(4) 5888 .sr(1) 5889 .m(m) 5890 .n(n) 5891 .k(k) 5892 .iterations(1) 5893 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5894 } 5895 } 5896 } 5897 } 5898 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_div_16)5899 
TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_div_16) { 5900 TEST_REQUIRES_ARM_NEON_DOT; 5901 for (uint32_t n = 32; n <= 48; n += 16) { 5902 for (size_t k = 1; k <= 40; k += 9) { 5903 GemmMicrokernelTester() 5904 .mr(3) 5905 .nr(16) 5906 .kr(4) 5907 .sr(1) 5908 .m(3) 5909 .n(n) 5910 .k(k) 5911 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5912 } 5913 } 5914 } 5915 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_div_16_strided_cn)5916 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_div_16_strided_cn) { 5917 TEST_REQUIRES_ARM_NEON_DOT; 5918 for (uint32_t n = 32; n <= 48; n += 16) { 5919 for (size_t k = 1; k <= 40; k += 9) { 5920 GemmMicrokernelTester() 5921 .mr(3) 5922 .nr(16) 5923 .kr(4) 5924 .sr(1) 5925 .m(3) 5926 .n(n) 5927 .k(k) 5928 .cn_stride(19) 5929 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5930 } 5931 } 5932 } 5933 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_div_16_strided_a)5934 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_div_16_strided_a) { 5935 TEST_REQUIRES_ARM_NEON_DOT; 5936 for (uint32_t n = 32; n <= 48; n += 16) { 5937 for (size_t k = 1; k <= 40; k += 9) { 5938 GemmMicrokernelTester() 5939 .mr(3) 5940 .nr(16) 5941 .kr(4) 5942 .sr(1) 5943 .m(3) 5944 .n(n) 5945 .k(k) 5946 .a_stride(43) 5947 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5948 } 5949 } 5950 } 5951 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,n_div_16_subtile)5952 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, n_div_16_subtile) { 5953 TEST_REQUIRES_ARM_NEON_DOT; 5954 for (uint32_t n = 32; n <= 48; n += 16) { 5955 for (size_t k = 1; k <= 40; k += 9) { 5956 for (uint32_t m = 1; m <= 3; m++) { 5957 GemmMicrokernelTester() 5958 .mr(3) 5959 .nr(16) 5960 .kr(4) 5961 .sr(1) 5962 .m(m) 5963 .n(n) 5964 .k(k) 5965 .iterations(1) 5966 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5967 } 5968 } 5969 } 5970 } 5971 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,strided_cm_subtile)5972 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, strided_cm_subtile) { 5973 TEST_REQUIRES_ARM_NEON_DOT; 5974 for (size_t k = 1; k <= 40; k += 9) { 5975 for (uint32_t n = 1; n <= 16; n++) { 5976 for (uint32_t m = 1; m <= 3; m++) { 5977 GemmMicrokernelTester() 5978 .mr(3) 5979 .nr(16) 5980 .kr(4) 5981 .sr(1) 5982 .m(m) 5983 .n(n) 5984 .k(k) 5985 .cm_stride(19) 5986 .iterations(1) 5987 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 5988 } 5989 } 5990 } 5991 } 5992 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,qmin)5993 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, qmin) { 5994 TEST_REQUIRES_ARM_NEON_DOT; 5995 GemmMicrokernelTester() 5996 .mr(3) 5997 .nr(16) 5998 .kr(4) 5999 .sr(1) 6000 .m(3) 6001 .n(16) 6002 .k(8) 6003 .qmin(128) 6004 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6005 } 6006 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,qmax)6007 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, qmax) { 6008 TEST_REQUIRES_ARM_NEON_DOT; 6009 GemmMicrokernelTester() 6010 .mr(3) 6011 .nr(16) 6012 .kr(4) 6013 .sr(1) 6014 .m(3) 6015 .n(16) 6016 .k(8) 6017 .qmax(128) 6018 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6019 } 6020 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,strided_cm)6021 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, strided_cm) { 6022 TEST_REQUIRES_ARM_NEON_DOT; 6023 GemmMicrokernelTester() 6024 .mr(3) 6025 .nr(16) 6026 .kr(4) 6027 .sr(1) 6028 .m(3) 6029 .n(16) 6030 .k(8) 6031 .cm_stride(19) 6032 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 6033 } 6034 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,no_a_zero_point)6035 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, no_a_zero_point) { 6036 TEST_REQUIRES_ARM_NEON_DOT; 6037 for (size_t k = 1; k <= 40; k += 9) { 6038 GemmMicrokernelTester() 6039 .mr(3) 6040 .nr(16) 6041 .kr(4) 6042 .sr(1) 6043 .m(3) 6044 .n(16) 6045 .k(k) 6046 .a_zero_point(0) 6047 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6048 } 6049 } 6050 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,no_b_zero_point)6051 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, no_b_zero_point) { 6052 TEST_REQUIRES_ARM_NEON_DOT; 6053 for (size_t k = 1; k <= 40; k += 9) { 6054 GemmMicrokernelTester() 6055 .mr(3) 6056 .nr(16) 6057 .kr(4) 6058 .sr(1) 6059 .m(3) 6060 .n(16) 6061 .k(k) 6062 .b_zero_point(0) 6063 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6064 } 6065 } 6066 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT,no_zero_point)6067 TEST(QU8_GEMM_MINMAX_RNDNU_3X16C4__NEONDOT, no_zero_point) { 6068 TEST_REQUIRES_ARM_NEON_DOT; 6069 for (size_t k = 1; k <= 40; k += 9) { 6070 GemmMicrokernelTester() 6071 .mr(3) 6072 .nr(16) 6073 .kr(4) 6074 .sr(1) 6075 .m(3) 6076 .n(16) 6077 .k(k) 6078 .a_zero_point(0) 6079 .b_zero_point(0) 6080 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6081 } 6082 } 6083 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 6084 6085 6086 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,k_eq_8)6087 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8) { 6088 TEST_REQUIRES_ARM_NEON_DOT; 6089 GemmMicrokernelTester() 6090 .mr(4) 6091 .nr(16) 6092 .kr(4) 6093 .sr(1) 6094 .m(4) 6095 .n(16) 6096 .k(8) 6097 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6098 } 6099 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,strided_cn)6100 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, strided_cn) { 6101 TEST_REQUIRES_ARM_NEON_DOT; 6102 GemmMicrokernelTester() 6103 .mr(4) 6104 .nr(16) 6105 .kr(4) 6106 .sr(1) 6107 .m(4) 6108 .n(16) 6109 .k(8) 6110 .cn_stride(19) 6111 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6112 } 6113 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,k_eq_8_strided_a)6114 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_strided_a) { 6115 TEST_REQUIRES_ARM_NEON_DOT; 6116 GemmMicrokernelTester() 6117 .mr(4) 6118 .nr(16) 6119 .kr(4) 6120 .sr(1) 6121 .m(4) 6122 .n(16) 6123 .k(8) 6124 .a_stride(11) 6125 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6126 } 6127 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,k_eq_8_subtile)6128 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_subtile) { 6129 TEST_REQUIRES_ARM_NEON_DOT; 6130 for (uint32_t n = 1; n <= 16; n++) { 6131 for (uint32_t m = 1; m <= 4; m++) { 6132 GemmMicrokernelTester() 6133 .mr(4) 6134 .nr(16) 6135 .kr(4) 6136 .sr(1) 6137 .m(m) 6138 .n(n) 6139 .k(8) 6140 .iterations(1) 6141 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6142 } 6143 } 6144 } 6145 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,k_eq_8_subtile_m)6146 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_subtile_m) { 6147 TEST_REQUIRES_ARM_NEON_DOT; 6148 for (uint32_t m = 1; m <= 4; m++) { 6149 GemmMicrokernelTester() 6150 .mr(4) 6151 .nr(16) 6152 .kr(4) 6153 .sr(1) 6154 .m(m) 6155 .n(16) 6156 .k(8) 6157 .iterations(1) 6158 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6159 } 6160 } 6161 
// Tests for xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot
// (mr = 4, nr = 16, kr = 4, sr = 1): k = 8 over n, and k below 8.

// k = 8, m = 4 over every n in [1, 16], one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(4)
      .n(n)
      .k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16 over every k in [1, 7].
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16 over every k in [1, 7] with a_stride(11).
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every k in [1, 7], n in [1, 16] and m in [1, 4], one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Tests for xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot
// (mr = 4, nr = 16, kr = 4, sr = 1): k above 8 and k in multiples of 8.

// m = 4, n = 16 over every k in [9, 15].
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16 over every k in [9, 15] with a_stride(19).
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every k in [9, 15], n in [1, 16] and m in [1, 4], one iteration each.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16 over k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(4)
      .n(16)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,k_div_8_strided_a)6295 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_div_8_strided_a) { 6296 TEST_REQUIRES_ARM_NEON_DOT; 6297 for (size_t k = 16; k <= 80; k += 8) { 6298 GemmMicrokernelTester() 6299 .mr(4) 6300 .nr(16) 6301 .kr(4) 6302 .sr(1) 6303 .m(4) 6304 .n(16) 6305 .k(k) 6306 .a_stride(83) 6307 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6308 } 6309 } 6310 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,k_div_8_subtile)6311 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, k_div_8_subtile) { 6312 TEST_REQUIRES_ARM_NEON_DOT; 6313 for (size_t k = 16; k <= 80; k += 8) { 6314 for (uint32_t n = 1; n <= 16; n++) { 6315 for (uint32_t m = 1; m <= 4; m++) { 6316 GemmMicrokernelTester() 6317 .mr(4) 6318 .nr(16) 6319 .kr(4) 6320 .sr(1) 6321 .m(m) 6322 .n(n) 6323 .k(k) 6324 .iterations(1) 6325 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6326 } 6327 } 6328 } 6329 } 6330 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_gt_16)6331 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16) { 6332 TEST_REQUIRES_ARM_NEON_DOT; 6333 for (uint32_t n = 17; n < 32; n++) { 6334 for (size_t k = 1; k <= 40; k += 9) { 6335 GemmMicrokernelTester() 6336 .mr(4) 6337 .nr(16) 6338 .kr(4) 6339 .sr(1) 6340 .m(4) 6341 .n(n) 6342 .k(k) 6343 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6344 } 6345 } 6346 } 6347 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_gt_16_strided_cn)6348 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16_strided_cn) { 6349 TEST_REQUIRES_ARM_NEON_DOT; 6350 for (uint32_t n = 17; n < 32; n++) { 6351 for (size_t k = 1; k <= 40; k += 9) { 6352 GemmMicrokernelTester() 6353 .mr(4) 6354 .nr(16) 6355 .kr(4) 6356 .sr(1) 6357 .m(4) 6358 .n(n) 6359 .k(k) 6360 .cn_stride(19) 6361 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6362 } 6363 } 6364 } 6365 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_gt_16_strided_a)6366 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16_strided_a) { 6367 TEST_REQUIRES_ARM_NEON_DOT; 6368 for (uint32_t n = 17; n < 32; n++) { 6369 for (size_t k = 1; k <= 40; k += 9) { 6370 GemmMicrokernelTester() 6371 .mr(4) 6372 .nr(16) 6373 .kr(4) 6374 .sr(1) 6375 .m(4) 6376 .n(n) 6377 .k(k) 6378 .a_stride(43) 6379 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6380 } 6381 } 6382 } 6383 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_gt_16_subtile)6384 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_gt_16_subtile) { 6385 TEST_REQUIRES_ARM_NEON_DOT; 6386 for (uint32_t n = 17; n < 32; n++) { 6387 for (size_t k = 1; k <= 40; k += 9) { 6388 for (uint32_t m = 1; m <= 4; m++) { 6389 GemmMicrokernelTester() 6390 .mr(4) 6391 .nr(16) 6392 .kr(4) 6393 .sr(1) 6394 .m(m) 6395 .n(n) 6396 .k(k) 6397 .iterations(1) 6398 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6399 } 6400 } 6401 } 6402 } 6403 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_div_16)6404 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16) { 6405 TEST_REQUIRES_ARM_NEON_DOT; 6406 for (uint32_t n = 32; n <= 48; n += 16) { 6407 for (size_t k = 1; k <= 40; k += 9) { 6408 GemmMicrokernelTester() 6409 .mr(4) 6410 .nr(16) 6411 .kr(4) 6412 .sr(1) 6413 .m(4) 6414 .n(n) 6415 .k(k) 6416 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6417 } 6418 } 6419 } 6420 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_div_16_strided_cn)6421 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16_strided_cn) { 6422 TEST_REQUIRES_ARM_NEON_DOT; 6423 for (uint32_t n = 32; n <= 48; n += 16) { 
6424 for (size_t k = 1; k <= 40; k += 9) { 6425 GemmMicrokernelTester() 6426 .mr(4) 6427 .nr(16) 6428 .kr(4) 6429 .sr(1) 6430 .m(4) 6431 .n(n) 6432 .k(k) 6433 .cn_stride(19) 6434 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6435 } 6436 } 6437 } 6438 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_div_16_strided_a)6439 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16_strided_a) { 6440 TEST_REQUIRES_ARM_NEON_DOT; 6441 for (uint32_t n = 32; n <= 48; n += 16) { 6442 for (size_t k = 1; k <= 40; k += 9) { 6443 GemmMicrokernelTester() 6444 .mr(4) 6445 .nr(16) 6446 .kr(4) 6447 .sr(1) 6448 .m(4) 6449 .n(n) 6450 .k(k) 6451 .a_stride(43) 6452 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6453 } 6454 } 6455 } 6456 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,n_div_16_subtile)6457 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, n_div_16_subtile) { 6458 TEST_REQUIRES_ARM_NEON_DOT; 6459 for (uint32_t n = 32; n <= 48; n += 16) { 6460 for (size_t k = 1; k <= 40; k += 9) { 6461 for (uint32_t m = 1; m <= 4; m++) { 6462 GemmMicrokernelTester() 6463 .mr(4) 6464 .nr(16) 6465 .kr(4) 6466 .sr(1) 6467 .m(m) 6468 .n(n) 6469 .k(k) 6470 .iterations(1) 6471 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6472 } 6473 } 6474 } 6475 } 6476 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,strided_cm_subtile)6477 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, strided_cm_subtile) { 6478 TEST_REQUIRES_ARM_NEON_DOT; 6479 for (size_t k = 1; k <= 40; k += 9) { 6480 for (uint32_t n = 1; n <= 16; n++) { 6481 for (uint32_t m = 1; m <= 4; m++) { 6482 GemmMicrokernelTester() 6483 .mr(4) 6484 .nr(16) 6485 .kr(4) 6486 .sr(1) 6487 .m(m) 6488 .n(n) 6489 .k(k) 6490 .cm_stride(19) 6491 .iterations(1) 6492 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6493 } 6494 } 6495 } 6496 } 6497 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,qmin)6498 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, qmin) { 6499 TEST_REQUIRES_ARM_NEON_DOT; 6500 GemmMicrokernelTester() 6501 .mr(4) 6502 .nr(16) 6503 .kr(4) 6504 .sr(1) 6505 .m(4) 6506 .n(16) 6507 .k(8) 6508 .qmin(128) 6509 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6510 } 6511 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,qmax)6512 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, qmax) { 6513 TEST_REQUIRES_ARM_NEON_DOT; 6514 GemmMicrokernelTester() 6515 .mr(4) 6516 .nr(16) 6517 .kr(4) 6518 .sr(1) 6519 .m(4) 6520 .n(16) 6521 .k(8) 6522 .qmax(128) 6523 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6524 } 6525 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,strided_cm)6526 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, strided_cm) { 6527 TEST_REQUIRES_ARM_NEON_DOT; 6528 GemmMicrokernelTester() 6529 .mr(4) 6530 .nr(16) 6531 .kr(4) 6532 .sr(1) 6533 .m(4) 6534 .n(16) 6535 .k(8) 6536 .cm_stride(19) 6537 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6538 } 6539 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,no_a_zero_point)6540 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, no_a_zero_point) { 6541 TEST_REQUIRES_ARM_NEON_DOT; 6542 for (size_t k = 1; k <= 40; k += 9) { 6543 GemmMicrokernelTester() 6544 .mr(4) 6545 .nr(16) 6546 .kr(4) 6547 .sr(1) 6548 .m(4) 6549 .n(16) 6550 .k(k) 6551 .a_zero_point(0) 6552 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6553 } 6554 } 6555 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,no_b_zero_point)6556 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, no_b_zero_point) { 6557 
TEST_REQUIRES_ARM_NEON_DOT; 6558 for (size_t k = 1; k <= 40; k += 9) { 6559 GemmMicrokernelTester() 6560 .mr(4) 6561 .nr(16) 6562 .kr(4) 6563 .sr(1) 6564 .m(4) 6565 .n(16) 6566 .k(k) 6567 .b_zero_point(0) 6568 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6569 } 6570 } 6571 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT,no_zero_point)6572 TEST(QU8_GEMM_MINMAX_RNDNU_4X16C4__NEONDOT, no_zero_point) { 6573 TEST_REQUIRES_ARM_NEON_DOT; 6574 for (size_t k = 1; k <= 40; k += 9) { 6575 GemmMicrokernelTester() 6576 .mr(4) 6577 .nr(16) 6578 .kr(4) 6579 .sr(1) 6580 .m(4) 6581 .n(16) 6582 .k(k) 6583 .a_zero_point(0) 6584 .b_zero_point(0) 6585 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6586 } 6587 } 6588 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 6589 6590 6591 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_eq_8)6592 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_eq_8) { 6593 TEST_REQUIRES_ARM_NEON_DOT; 6594 GemmMicrokernelTester() 6595 .mr(5) 6596 .nr(16) 6597 .kr(4) 6598 .sr(1) 6599 .m(5) 6600 .n(16) 6601 .k(8) 6602 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6603 } 6604 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,strided_cn)6605 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, strided_cn) { 6606 TEST_REQUIRES_ARM_NEON_DOT; 6607 GemmMicrokernelTester() 6608 .mr(5) 6609 .nr(16) 6610 .kr(4) 6611 .sr(1) 6612 .m(5) 6613 .n(16) 6614 .k(8) 6615 .cn_stride(19) 6616 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6617 } 6618 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_eq_8_strided_a)6619 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_eq_8_strided_a) { 6620 TEST_REQUIRES_ARM_NEON_DOT; 6621 
GemmMicrokernelTester() 6622 .mr(5) 6623 .nr(16) 6624 .kr(4) 6625 .sr(1) 6626 .m(5) 6627 .n(16) 6628 .k(8) 6629 .a_stride(11) 6630 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6631 } 6632 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_eq_8_subtile)6633 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_eq_8_subtile) { 6634 TEST_REQUIRES_ARM_NEON_DOT; 6635 for (uint32_t n = 1; n <= 16; n++) { 6636 for (uint32_t m = 1; m <= 5; m++) { 6637 GemmMicrokernelTester() 6638 .mr(5) 6639 .nr(16) 6640 .kr(4) 6641 .sr(1) 6642 .m(m) 6643 .n(n) 6644 .k(8) 6645 .iterations(1) 6646 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6647 } 6648 } 6649 } 6650 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_eq_8_subtile_m)6651 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_eq_8_subtile_m) { 6652 TEST_REQUIRES_ARM_NEON_DOT; 6653 for (uint32_t m = 1; m <= 5; m++) { 6654 GemmMicrokernelTester() 6655 .mr(5) 6656 .nr(16) 6657 .kr(4) 6658 .sr(1) 6659 .m(m) 6660 .n(16) 6661 .k(8) 6662 .iterations(1) 6663 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6664 } 6665 } 6666 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_eq_8_subtile_n)6667 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_eq_8_subtile_n) { 6668 TEST_REQUIRES_ARM_NEON_DOT; 6669 for (uint32_t n = 1; n <= 16; n++) { 6670 GemmMicrokernelTester() 6671 .mr(5) 6672 .nr(16) 6673 .kr(4) 6674 .sr(1) 6675 .m(5) 6676 .n(n) 6677 .k(8) 6678 .iterations(1) 6679 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6680 } 6681 } 6682 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_lt_8)6683 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_lt_8) { 6684 TEST_REQUIRES_ARM_NEON_DOT; 6685 for (size_t k = 1; k < 8; k++) { 6686 GemmMicrokernelTester() 6687 .mr(5) 
6688 .nr(16) 6689 .kr(4) 6690 .sr(1) 6691 .m(5) 6692 .n(16) 6693 .k(k) 6694 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6695 } 6696 } 6697 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_lt_8_strided_a)6698 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_lt_8_strided_a) { 6699 TEST_REQUIRES_ARM_NEON_DOT; 6700 for (size_t k = 1; k < 8; k++) { 6701 GemmMicrokernelTester() 6702 .mr(5) 6703 .nr(16) 6704 .kr(4) 6705 .sr(1) 6706 .m(5) 6707 .n(16) 6708 .k(k) 6709 .a_stride(11) 6710 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6711 } 6712 } 6713 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_lt_8_subtile)6714 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_lt_8_subtile) { 6715 TEST_REQUIRES_ARM_NEON_DOT; 6716 for (size_t k = 1; k < 8; k++) { 6717 for (uint32_t n = 1; n <= 16; n++) { 6718 for (uint32_t m = 1; m <= 5; m++) { 6719 GemmMicrokernelTester() 6720 .mr(5) 6721 .nr(16) 6722 .kr(4) 6723 .sr(1) 6724 .m(m) 6725 .n(n) 6726 .k(k) 6727 .iterations(1) 6728 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6729 } 6730 } 6731 } 6732 } 6733 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_gt_8)6734 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_gt_8) { 6735 TEST_REQUIRES_ARM_NEON_DOT; 6736 for (size_t k = 9; k < 16; k++) { 6737 GemmMicrokernelTester() 6738 .mr(5) 6739 .nr(16) 6740 .kr(4) 6741 .sr(1) 6742 .m(5) 6743 .n(16) 6744 .k(k) 6745 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6746 } 6747 } 6748 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_gt_8_strided_a)6749 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_gt_8_strided_a) { 6750 TEST_REQUIRES_ARM_NEON_DOT; 6751 for (size_t k = 9; k < 16; k++) { 6752 GemmMicrokernelTester() 6753 .mr(5) 6754 .nr(16) 6755 .kr(4) 6756 
.sr(1) 6757 .m(5) 6758 .n(16) 6759 .k(k) 6760 .a_stride(19) 6761 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6762 } 6763 } 6764 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_gt_8_subtile)6765 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_gt_8_subtile) { 6766 TEST_REQUIRES_ARM_NEON_DOT; 6767 for (size_t k = 9; k < 16; k++) { 6768 for (uint32_t n = 1; n <= 16; n++) { 6769 for (uint32_t m = 1; m <= 5; m++) { 6770 GemmMicrokernelTester() 6771 .mr(5) 6772 .nr(16) 6773 .kr(4) 6774 .sr(1) 6775 .m(m) 6776 .n(n) 6777 .k(k) 6778 .iterations(1) 6779 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6780 } 6781 } 6782 } 6783 } 6784 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_div_8)6785 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_div_8) { 6786 TEST_REQUIRES_ARM_NEON_DOT; 6787 for (size_t k = 16; k <= 80; k += 8) { 6788 GemmMicrokernelTester() 6789 .mr(5) 6790 .nr(16) 6791 .kr(4) 6792 .sr(1) 6793 .m(5) 6794 .n(16) 6795 .k(k) 6796 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6797 } 6798 } 6799 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_div_8_strided_a)6800 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_div_8_strided_a) { 6801 TEST_REQUIRES_ARM_NEON_DOT; 6802 for (size_t k = 16; k <= 80; k += 8) { 6803 GemmMicrokernelTester() 6804 .mr(5) 6805 .nr(16) 6806 .kr(4) 6807 .sr(1) 6808 .m(5) 6809 .n(16) 6810 .k(k) 6811 .a_stride(83) 6812 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6813 } 6814 } 6815 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,k_div_8_subtile)6816 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, k_div_8_subtile) { 6817 TEST_REQUIRES_ARM_NEON_DOT; 6818 for (size_t k = 16; k <= 80; k += 8) { 6819 for (uint32_t n = 1; n <= 16; n++) { 6820 for (uint32_t m 
= 1; m <= 5; m++) { 6821 GemmMicrokernelTester() 6822 .mr(5) 6823 .nr(16) 6824 .kr(4) 6825 .sr(1) 6826 .m(m) 6827 .n(n) 6828 .k(k) 6829 .iterations(1) 6830 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6831 } 6832 } 6833 } 6834 } 6835 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_gt_16)6836 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_gt_16) { 6837 TEST_REQUIRES_ARM_NEON_DOT; 6838 for (uint32_t n = 17; n < 32; n++) { 6839 for (size_t k = 1; k <= 40; k += 9) { 6840 GemmMicrokernelTester() 6841 .mr(5) 6842 .nr(16) 6843 .kr(4) 6844 .sr(1) 6845 .m(5) 6846 .n(n) 6847 .k(k) 6848 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6849 } 6850 } 6851 } 6852 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_gt_16_strided_cn)6853 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_gt_16_strided_cn) { 6854 TEST_REQUIRES_ARM_NEON_DOT; 6855 for (uint32_t n = 17; n < 32; n++) { 6856 for (size_t k = 1; k <= 40; k += 9) { 6857 GemmMicrokernelTester() 6858 .mr(5) 6859 .nr(16) 6860 .kr(4) 6861 .sr(1) 6862 .m(5) 6863 .n(n) 6864 .k(k) 6865 .cn_stride(19) 6866 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6867 } 6868 } 6869 } 6870 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_gt_16_strided_a)6871 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_gt_16_strided_a) { 6872 TEST_REQUIRES_ARM_NEON_DOT; 6873 for (uint32_t n = 17; n < 32; n++) { 6874 for (size_t k = 1; k <= 40; k += 9) { 6875 GemmMicrokernelTester() 6876 .mr(5) 6877 .nr(16) 6878 .kr(4) 6879 .sr(1) 6880 .m(5) 6881 .n(n) 6882 .k(k) 6883 .a_stride(43) 6884 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6885 } 6886 } 6887 } 6888 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_gt_16_subtile)6889 
TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_gt_16_subtile) { 6890 TEST_REQUIRES_ARM_NEON_DOT; 6891 for (uint32_t n = 17; n < 32; n++) { 6892 for (size_t k = 1; k <= 40; k += 9) { 6893 for (uint32_t m = 1; m <= 5; m++) { 6894 GemmMicrokernelTester() 6895 .mr(5) 6896 .nr(16) 6897 .kr(4) 6898 .sr(1) 6899 .m(m) 6900 .n(n) 6901 .k(k) 6902 .iterations(1) 6903 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6904 } 6905 } 6906 } 6907 } 6908 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_div_16)6909 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_div_16) { 6910 TEST_REQUIRES_ARM_NEON_DOT; 6911 for (uint32_t n = 32; n <= 48; n += 16) { 6912 for (size_t k = 1; k <= 40; k += 9) { 6913 GemmMicrokernelTester() 6914 .mr(5) 6915 .nr(16) 6916 .kr(4) 6917 .sr(1) 6918 .m(5) 6919 .n(n) 6920 .k(k) 6921 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6922 } 6923 } 6924 } 6925 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_div_16_strided_cn)6926 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_div_16_strided_cn) { 6927 TEST_REQUIRES_ARM_NEON_DOT; 6928 for (uint32_t n = 32; n <= 48; n += 16) { 6929 for (size_t k = 1; k <= 40; k += 9) { 6930 GemmMicrokernelTester() 6931 .mr(5) 6932 .nr(16) 6933 .kr(4) 6934 .sr(1) 6935 .m(5) 6936 .n(n) 6937 .k(k) 6938 .cn_stride(19) 6939 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6940 } 6941 } 6942 } 6943 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_div_16_strided_a)6944 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_div_16_strided_a) { 6945 TEST_REQUIRES_ARM_NEON_DOT; 6946 for (uint32_t n = 32; n <= 48; n += 16) { 6947 for (size_t k = 1; k <= 40; k += 9) { 6948 GemmMicrokernelTester() 6949 .mr(5) 6950 .nr(16) 6951 .kr(4) 6952 .sr(1) 6953 .m(5) 6954 .n(n) 6955 .k(k) 6956 .a_stride(43) 6957 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6958 } 6959 } 6960 } 6961 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,n_div_16_subtile)6962 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, n_div_16_subtile) { 6963 TEST_REQUIRES_ARM_NEON_DOT; 6964 for (uint32_t n = 32; n <= 48; n += 16) { 6965 for (size_t k = 1; k <= 40; k += 9) { 6966 for (uint32_t m = 1; m <= 5; m++) { 6967 GemmMicrokernelTester() 6968 .mr(5) 6969 .nr(16) 6970 .kr(4) 6971 .sr(1) 6972 .m(m) 6973 .n(n) 6974 .k(k) 6975 .iterations(1) 6976 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6977 } 6978 } 6979 } 6980 } 6981 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,strided_cm_subtile)6982 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, strided_cm_subtile) { 6983 TEST_REQUIRES_ARM_NEON_DOT; 6984 for (size_t k = 1; k <= 40; k += 9) { 6985 for (uint32_t n = 1; n <= 16; n++) { 6986 for (uint32_t m = 1; m <= 5; m++) { 6987 GemmMicrokernelTester() 6988 .mr(5) 6989 .nr(16) 6990 .kr(4) 6991 .sr(1) 6992 .m(m) 6993 .n(n) 6994 .k(k) 6995 .cm_stride(19) 6996 .iterations(1) 6997 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 6998 } 6999 } 7000 } 7001 } 7002 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,qmin)7003 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, qmin) { 7004 TEST_REQUIRES_ARM_NEON_DOT; 7005 GemmMicrokernelTester() 7006 .mr(5) 7007 .nr(16) 7008 .kr(4) 7009 .sr(1) 7010 .m(5) 7011 .n(16) 7012 .k(8) 7013 .qmin(128) 7014 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7015 } 7016 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,qmax)7017 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, qmax) { 7018 TEST_REQUIRES_ARM_NEON_DOT; 7019 GemmMicrokernelTester() 7020 .mr(5) 7021 .nr(16) 7022 .kr(4) 7023 .sr(1) 7024 .m(5) 7025 
.n(16) 7026 .k(8) 7027 .qmax(128) 7028 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7029 } 7030 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,strided_cm)7031 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, strided_cm) { 7032 TEST_REQUIRES_ARM_NEON_DOT; 7033 GemmMicrokernelTester() 7034 .mr(5) 7035 .nr(16) 7036 .kr(4) 7037 .sr(1) 7038 .m(5) 7039 .n(16) 7040 .k(8) 7041 .cm_stride(19) 7042 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7043 } 7044 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,no_a_zero_point)7045 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, no_a_zero_point) { 7046 TEST_REQUIRES_ARM_NEON_DOT; 7047 for (size_t k = 1; k <= 40; k += 9) { 7048 GemmMicrokernelTester() 7049 .mr(5) 7050 .nr(16) 7051 .kr(4) 7052 .sr(1) 7053 .m(5) 7054 .n(16) 7055 .k(k) 7056 .a_zero_point(0) 7057 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7058 } 7059 } 7060 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,no_b_zero_point)7061 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, no_b_zero_point) { 7062 TEST_REQUIRES_ARM_NEON_DOT; 7063 for (size_t k = 1; k <= 40; k += 9) { 7064 GemmMicrokernelTester() 7065 .mr(5) 7066 .nr(16) 7067 .kr(4) 7068 .sr(1) 7069 .m(5) 7070 .n(16) 7071 .k(k) 7072 .b_zero_point(0) 7073 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7074 } 7075 } 7076 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT,no_zero_point)7077 TEST(QU8_GEMM_MINMAX_RNDNU_5X16C4__NEONDOT, no_zero_point) { 7078 TEST_REQUIRES_ARM_NEON_DOT; 7079 for (size_t k = 1; k <= 40; k += 9) { 7080 GemmMicrokernelTester() 7081 .mr(5) 7082 .nr(16) 7083 .kr(4) 7084 .sr(1) 7085 .m(5) 7086 .n(16) 7087 .k(k) 7088 .a_zero_point(0) 7089 .b_zero_point(0) 7090 
.Test(xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7091 } 7092 } 7093 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 7094 7095 7096 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_eq_8)7097 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8) { 7098 TEST_REQUIRES_ARM_NEON_DOT; 7099 GemmMicrokernelTester() 7100 .mr(8) 7101 .nr(16) 7102 .kr(4) 7103 .sr(1) 7104 .m(8) 7105 .n(16) 7106 .k(8) 7107 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7108 } 7109 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,strided_cn)7110 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, strided_cn) { 7111 TEST_REQUIRES_ARM_NEON_DOT; 7112 GemmMicrokernelTester() 7113 .mr(8) 7114 .nr(16) 7115 .kr(4) 7116 .sr(1) 7117 .m(8) 7118 .n(16) 7119 .k(8) 7120 .cn_stride(19) 7121 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7122 } 7123 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_eq_8_strided_a)7124 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_strided_a) { 7125 TEST_REQUIRES_ARM_NEON_DOT; 7126 GemmMicrokernelTester() 7127 .mr(8) 7128 .nr(16) 7129 .kr(4) 7130 .sr(1) 7131 .m(8) 7132 .n(16) 7133 .k(8) 7134 .a_stride(11) 7135 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7136 } 7137 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_eq_8_subtile)7138 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_subtile) { 7139 TEST_REQUIRES_ARM_NEON_DOT; 7140 for (uint32_t n = 1; n <= 16; n++) { 7141 for (uint32_t m = 1; m <= 8; m++) { 7142 GemmMicrokernelTester() 7143 .mr(8) 7144 .nr(16) 7145 .kr(4) 7146 .sr(1) 7147 .m(m) 7148 .n(n) 7149 .k(8) 7150 .iterations(1) 7151 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7152 } 7153 } 7154 } 7155 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_eq_8_subtile_m)7156 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_subtile_m) { 7157 TEST_REQUIRES_ARM_NEON_DOT; 7158 for (uint32_t m = 1; m <= 8; m++) { 7159 GemmMicrokernelTester() 7160 .mr(8) 7161 .nr(16) 7162 .kr(4) 7163 .sr(1) 7164 .m(m) 7165 .n(16) 7166 .k(8) 7167 .iterations(1) 7168 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7169 } 7170 } 7171 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_eq_8_subtile_n)7172 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_eq_8_subtile_n) { 7173 TEST_REQUIRES_ARM_NEON_DOT; 7174 for (uint32_t n = 1; n <= 16; n++) { 7175 GemmMicrokernelTester() 7176 .mr(8) 7177 .nr(16) 7178 .kr(4) 7179 .sr(1) 7180 .m(8) 7181 .n(n) 7182 .k(8) 7183 .iterations(1) 7184 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7185 } 7186 } 7187 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_lt_8)7188 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_lt_8) { 7189 TEST_REQUIRES_ARM_NEON_DOT; 7190 for (size_t k = 1; k < 8; k++) { 7191 GemmMicrokernelTester() 7192 .mr(8) 7193 .nr(16) 7194 .kr(4) 7195 .sr(1) 7196 .m(8) 7197 .n(16) 7198 .k(k) 7199 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7200 } 7201 } 7202 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,k_lt_8_strided_a)7203 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_lt_8_strided_a) { 7204 TEST_REQUIRES_ARM_NEON_DOT; 7205 for (size_t k = 1; k < 8; k++) { 7206 GemmMicrokernelTester() 7207 .mr(8) 7208 .nr(16) 7209 .kr(4) 7210 .sr(1) 7211 .m(8) 7212 .n(16) 7213 .k(k) 7214 .a_stride(11) 7215 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7216 } 7217 } 7218 
// 8x16c4 NEONDOT: every k in [1, 7], n in [1, 16] and m in [1, 8], one
// iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 8x16c4 NEONDOT: full 8x16 tile for every k in [9, 15].
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(8)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(8)
      .n(16)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 8x16c4 NEONDOT: full 8x16 tile for every k in [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(8)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(8)
      .n(16)
      .k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 8x16c4 NEONDOT: every k in [9, 15], n in [1, 16] and m in [1, 8], one
// iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 8x16c4 NEONDOT: full 8x16 tile for k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_div_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(8)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(8)
      .n(16)
      .k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 8x16c4 NEONDOT: full 8x16 tile for k = 16, 24, ..., 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(8)
      .nr(16)
      .kr(4)
      .sr(1)
      .m(8)
      .n(16)
      .k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// 8x16c4 NEONDOT: k = 16, 24, ..., 80 for every n in [1, 16] and m in [1, 8],
// one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 8x16c4 NEONDOT: every n in [17, 31] and k in {1, 10, 19, 28, 37} with m = 8.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8)
        .nr(16)
        .kr(4)
        .sr(1)
        .m(8)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_16, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8)
        .nr(16)
        .kr(4)
        .sr(1)
        .m(8)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_gt_16, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8)
        .nr(16)
        .kr(4)
        .sr(1)
        .m(8)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// 8x16c4 NEONDOT: every n in [17, 31], k in {1, 10, 19, 28, 37} and m in [1, 8],
// one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// 8x16c4 NEONDOT: n in {32, 48} and k in {1, 10, 19, 28, 37} with m = 8.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8)
        .nr(16)
        .kr(4)
        .sr(1)
        .m(8)
        .n(n)
        .k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_div_16, with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8)
        .nr(16)
        .kr(4)
        .sr(1)
        .m(8)
        .n(n)
        .k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Same n/k sweep as n_div_16, with a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(8)
        .nr(16)
        .kr(4)
        .sr(1)
        .m(8)
        .n(n)
        .k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// 8x16c4 NEONDOT: n in {32, 48}, k in {1, 10, 19, 28, 37} and every m in [1, 8],
// one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 8; m++) {
        GemmMicrokernelTester()
          .mr(8)
          .nr(16)
          .kr(4)
          .sr(1)
          .m(m)
          .n(n)
          .k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}
7486 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,strided_cm_subtile)7487 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, strided_cm_subtile) { 7488 TEST_REQUIRES_ARM_NEON_DOT; 7489 for (size_t k = 1; k <= 40; k += 9) { 7490 for (uint32_t n = 1; n <= 16; n++) { 7491 for (uint32_t m = 1; m <= 8; m++) { 7492 GemmMicrokernelTester() 7493 .mr(8) 7494 .nr(16) 7495 .kr(4) 7496 .sr(1) 7497 .m(m) 7498 .n(n) 7499 .k(k) 7500 .cm_stride(19) 7501 .iterations(1) 7502 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7503 } 7504 } 7505 } 7506 } 7507 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,qmin)7508 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, qmin) { 7509 TEST_REQUIRES_ARM_NEON_DOT; 7510 GemmMicrokernelTester() 7511 .mr(8) 7512 .nr(16) 7513 .kr(4) 7514 .sr(1) 7515 .m(8) 7516 .n(16) 7517 .k(8) 7518 .qmin(128) 7519 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7520 } 7521 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,qmax)7522 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, qmax) { 7523 TEST_REQUIRES_ARM_NEON_DOT; 7524 GemmMicrokernelTester() 7525 .mr(8) 7526 .nr(16) 7527 .kr(4) 7528 .sr(1) 7529 .m(8) 7530 .n(16) 7531 .k(8) 7532 .qmax(128) 7533 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7534 } 7535 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,strided_cm)7536 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, strided_cm) { 7537 TEST_REQUIRES_ARM_NEON_DOT; 7538 GemmMicrokernelTester() 7539 .mr(8) 7540 .nr(16) 7541 .kr(4) 7542 .sr(1) 7543 .m(8) 7544 .n(16) 7545 .k(8) 7546 .cm_stride(19) 7547 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7548 } 7549 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,no_a_zero_point)7550 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, no_a_zero_point) { 7551 
TEST_REQUIRES_ARM_NEON_DOT; 7552 for (size_t k = 1; k <= 40; k += 9) { 7553 GemmMicrokernelTester() 7554 .mr(8) 7555 .nr(16) 7556 .kr(4) 7557 .sr(1) 7558 .m(8) 7559 .n(16) 7560 .k(k) 7561 .a_zero_point(0) 7562 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7563 } 7564 } 7565 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,no_b_zero_point)7566 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, no_b_zero_point) { 7567 TEST_REQUIRES_ARM_NEON_DOT; 7568 for (size_t k = 1; k <= 40; k += 9) { 7569 GemmMicrokernelTester() 7570 .mr(8) 7571 .nr(16) 7572 .kr(4) 7573 .sr(1) 7574 .m(8) 7575 .n(16) 7576 .k(k) 7577 .b_zero_point(0) 7578 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7579 } 7580 } 7581 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT,no_zero_point)7582 TEST(QU8_GEMM_MINMAX_RNDNU_8X16C4__NEONDOT, no_zero_point) { 7583 TEST_REQUIRES_ARM_NEON_DOT; 7584 for (size_t k = 1; k <= 40; k += 9) { 7585 GemmMicrokernelTester() 7586 .mr(8) 7587 .nr(16) 7588 .kr(4) 7589 .sr(1) 7590 .m(8) 7591 .n(16) 7592 .k(k) 7593 .a_zero_point(0) 7594 .b_zero_point(0) 7595 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7596 } 7597 } 7598 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 7599 7600 7601 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_eq_8)7602 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_eq_8) { 7603 TEST_REQUIRES_ARM_NEON_DOT; 7604 GemmMicrokernelTester() 7605 .mr(2) 7606 .nr(32) 7607 .kr(4) 7608 .sr(1) 7609 .m(2) 7610 .n(32) 7611 .k(8) 7612 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7613 } 7614 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,strided_cn)7615 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, strided_cn) 
{ 7616 TEST_REQUIRES_ARM_NEON_DOT; 7617 GemmMicrokernelTester() 7618 .mr(2) 7619 .nr(32) 7620 .kr(4) 7621 .sr(1) 7622 .m(2) 7623 .n(32) 7624 .k(8) 7625 .cn_stride(37) 7626 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7627 } 7628 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_eq_8_strided_a)7629 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_eq_8_strided_a) { 7630 TEST_REQUIRES_ARM_NEON_DOT; 7631 GemmMicrokernelTester() 7632 .mr(2) 7633 .nr(32) 7634 .kr(4) 7635 .sr(1) 7636 .m(2) 7637 .n(32) 7638 .k(8) 7639 .a_stride(11) 7640 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7641 } 7642 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_eq_8_subtile)7643 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_eq_8_subtile) { 7644 TEST_REQUIRES_ARM_NEON_DOT; 7645 for (uint32_t n = 1; n <= 32; n++) { 7646 for (uint32_t m = 1; m <= 2; m++) { 7647 GemmMicrokernelTester() 7648 .mr(2) 7649 .nr(32) 7650 .kr(4) 7651 .sr(1) 7652 .m(m) 7653 .n(n) 7654 .k(8) 7655 .iterations(1) 7656 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7657 } 7658 } 7659 } 7660 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_eq_8_subtile_m)7661 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_eq_8_subtile_m) { 7662 TEST_REQUIRES_ARM_NEON_DOT; 7663 for (uint32_t m = 1; m <= 2; m++) { 7664 GemmMicrokernelTester() 7665 .mr(2) 7666 .nr(32) 7667 .kr(4) 7668 .sr(1) 7669 .m(m) 7670 .n(32) 7671 .k(8) 7672 .iterations(1) 7673 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7674 } 7675 } 7676 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_eq_8_subtile_n)7677 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_eq_8_subtile_n) { 7678 TEST_REQUIRES_ARM_NEON_DOT; 7679 for (uint32_t n = 1; n <= 32; n++) { 7680 
GemmMicrokernelTester() 7681 .mr(2) 7682 .nr(32) 7683 .kr(4) 7684 .sr(1) 7685 .m(2) 7686 .n(n) 7687 .k(8) 7688 .iterations(1) 7689 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7690 } 7691 } 7692 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_lt_8)7693 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_lt_8) { 7694 TEST_REQUIRES_ARM_NEON_DOT; 7695 for (size_t k = 1; k < 8; k++) { 7696 GemmMicrokernelTester() 7697 .mr(2) 7698 .nr(32) 7699 .kr(4) 7700 .sr(1) 7701 .m(2) 7702 .n(32) 7703 .k(k) 7704 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7705 } 7706 } 7707 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_lt_8_strided_a)7708 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_lt_8_strided_a) { 7709 TEST_REQUIRES_ARM_NEON_DOT; 7710 for (size_t k = 1; k < 8; k++) { 7711 GemmMicrokernelTester() 7712 .mr(2) 7713 .nr(32) 7714 .kr(4) 7715 .sr(1) 7716 .m(2) 7717 .n(32) 7718 .k(k) 7719 .a_stride(11) 7720 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7721 } 7722 } 7723 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_lt_8_subtile)7724 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_lt_8_subtile) { 7725 TEST_REQUIRES_ARM_NEON_DOT; 7726 for (size_t k = 1; k < 8; k++) { 7727 for (uint32_t n = 1; n <= 32; n++) { 7728 for (uint32_t m = 1; m <= 2; m++) { 7729 GemmMicrokernelTester() 7730 .mr(2) 7731 .nr(32) 7732 .kr(4) 7733 .sr(1) 7734 .m(m) 7735 .n(n) 7736 .k(k) 7737 .iterations(1) 7738 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7739 } 7740 } 7741 } 7742 } 7743 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_gt_8)7744 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_gt_8) { 7745 TEST_REQUIRES_ARM_NEON_DOT; 7746 for (size_t k = 9; k < 16; k++) { 7747 GemmMicrokernelTester() 7748 
.mr(2) 7749 .nr(32) 7750 .kr(4) 7751 .sr(1) 7752 .m(2) 7753 .n(32) 7754 .k(k) 7755 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7756 } 7757 } 7758 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_gt_8_strided_a)7759 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_gt_8_strided_a) { 7760 TEST_REQUIRES_ARM_NEON_DOT; 7761 for (size_t k = 9; k < 16; k++) { 7762 GemmMicrokernelTester() 7763 .mr(2) 7764 .nr(32) 7765 .kr(4) 7766 .sr(1) 7767 .m(2) 7768 .n(32) 7769 .k(k) 7770 .a_stride(19) 7771 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7772 } 7773 } 7774 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_gt_8_subtile)7775 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_gt_8_subtile) { 7776 TEST_REQUIRES_ARM_NEON_DOT; 7777 for (size_t k = 9; k < 16; k++) { 7778 for (uint32_t n = 1; n <= 32; n++) { 7779 for (uint32_t m = 1; m <= 2; m++) { 7780 GemmMicrokernelTester() 7781 .mr(2) 7782 .nr(32) 7783 .kr(4) 7784 .sr(1) 7785 .m(m) 7786 .n(n) 7787 .k(k) 7788 .iterations(1) 7789 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7790 } 7791 } 7792 } 7793 } 7794 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_div_8)7795 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_div_8) { 7796 TEST_REQUIRES_ARM_NEON_DOT; 7797 for (size_t k = 16; k <= 80; k += 8) { 7798 GemmMicrokernelTester() 7799 .mr(2) 7800 .nr(32) 7801 .kr(4) 7802 .sr(1) 7803 .m(2) 7804 .n(32) 7805 .k(k) 7806 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7807 } 7808 } 7809 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_div_8_strided_a)7810 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_div_8_strided_a) { 7811 TEST_REQUIRES_ARM_NEON_DOT; 7812 for (size_t k = 16; k <= 80; k += 8) { 7813 GemmMicrokernelTester() 7814 .mr(2) 7815 
.nr(32) 7816 .kr(4) 7817 .sr(1) 7818 .m(2) 7819 .n(32) 7820 .k(k) 7821 .a_stride(83) 7822 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7823 } 7824 } 7825 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,k_div_8_subtile)7826 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, k_div_8_subtile) { 7827 TEST_REQUIRES_ARM_NEON_DOT; 7828 for (size_t k = 16; k <= 80; k += 8) { 7829 for (uint32_t n = 1; n <= 32; n++) { 7830 for (uint32_t m = 1; m <= 2; m++) { 7831 GemmMicrokernelTester() 7832 .mr(2) 7833 .nr(32) 7834 .kr(4) 7835 .sr(1) 7836 .m(m) 7837 .n(n) 7838 .k(k) 7839 .iterations(1) 7840 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7841 } 7842 } 7843 } 7844 } 7845 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_gt_32)7846 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_gt_32) { 7847 TEST_REQUIRES_ARM_NEON_DOT; 7848 for (uint32_t n = 33; n < 64; n++) { 7849 for (size_t k = 1; k <= 40; k += 9) { 7850 GemmMicrokernelTester() 7851 .mr(2) 7852 .nr(32) 7853 .kr(4) 7854 .sr(1) 7855 .m(2) 7856 .n(n) 7857 .k(k) 7858 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7859 } 7860 } 7861 } 7862 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_gt_32_strided_cn)7863 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_gt_32_strided_cn) { 7864 TEST_REQUIRES_ARM_NEON_DOT; 7865 for (uint32_t n = 33; n < 64; n++) { 7866 for (size_t k = 1; k <= 40; k += 9) { 7867 GemmMicrokernelTester() 7868 .mr(2) 7869 .nr(32) 7870 .kr(4) 7871 .sr(1) 7872 .m(2) 7873 .n(n) 7874 .k(k) 7875 .cn_stride(37) 7876 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7877 } 7878 } 7879 } 7880 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_gt_32_strided_a)7881 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_gt_32_strided_a) { 7882 
TEST_REQUIRES_ARM_NEON_DOT; 7883 for (uint32_t n = 33; n < 64; n++) { 7884 for (size_t k = 1; k <= 40; k += 9) { 7885 GemmMicrokernelTester() 7886 .mr(2) 7887 .nr(32) 7888 .kr(4) 7889 .sr(1) 7890 .m(2) 7891 .n(n) 7892 .k(k) 7893 .a_stride(43) 7894 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7895 } 7896 } 7897 } 7898 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_gt_32_subtile)7899 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_gt_32_subtile) { 7900 TEST_REQUIRES_ARM_NEON_DOT; 7901 for (uint32_t n = 33; n < 64; n++) { 7902 for (size_t k = 1; k <= 40; k += 9) { 7903 for (uint32_t m = 1; m <= 2; m++) { 7904 GemmMicrokernelTester() 7905 .mr(2) 7906 .nr(32) 7907 .kr(4) 7908 .sr(1) 7909 .m(m) 7910 .n(n) 7911 .k(k) 7912 .iterations(1) 7913 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7914 } 7915 } 7916 } 7917 } 7918 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_div_32)7919 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_div_32) { 7920 TEST_REQUIRES_ARM_NEON_DOT; 7921 for (uint32_t n = 64; n <= 96; n += 32) { 7922 for (size_t k = 1; k <= 40; k += 9) { 7923 GemmMicrokernelTester() 7924 .mr(2) 7925 .nr(32) 7926 .kr(4) 7927 .sr(1) 7928 .m(2) 7929 .n(n) 7930 .k(k) 7931 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7932 } 7933 } 7934 } 7935 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_div_32_strided_cn)7936 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_div_32_strided_cn) { 7937 TEST_REQUIRES_ARM_NEON_DOT; 7938 for (uint32_t n = 64; n <= 96; n += 32) { 7939 for (size_t k = 1; k <= 40; k += 9) { 7940 GemmMicrokernelTester() 7941 .mr(2) 7942 .nr(32) 7943 .kr(4) 7944 .sr(1) 7945 .m(2) 7946 .n(n) 7947 .k(k) 7948 .cn_stride(37) 7949 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 7950 } 7951 } 7952 } 7953 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_div_32_strided_a)7954 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_div_32_strided_a) { 7955 TEST_REQUIRES_ARM_NEON_DOT; 7956 for (uint32_t n = 64; n <= 96; n += 32) { 7957 for (size_t k = 1; k <= 40; k += 9) { 7958 GemmMicrokernelTester() 7959 .mr(2) 7960 .nr(32) 7961 .kr(4) 7962 .sr(1) 7963 .m(2) 7964 .n(n) 7965 .k(k) 7966 .a_stride(43) 7967 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7968 } 7969 } 7970 } 7971 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,n_div_32_subtile)7972 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, n_div_32_subtile) { 7973 TEST_REQUIRES_ARM_NEON_DOT; 7974 for (uint32_t n = 64; n <= 96; n += 32) { 7975 for (size_t k = 1; k <= 40; k += 9) { 7976 for (uint32_t m = 1; m <= 2; m++) { 7977 GemmMicrokernelTester() 7978 .mr(2) 7979 .nr(32) 7980 .kr(4) 7981 .sr(1) 7982 .m(m) 7983 .n(n) 7984 .k(k) 7985 .iterations(1) 7986 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 7987 } 7988 } 7989 } 7990 } 7991 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,strided_cm_subtile)7992 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, strided_cm_subtile) { 7993 TEST_REQUIRES_ARM_NEON_DOT; 7994 for (size_t k = 1; k <= 40; k += 9) { 7995 for (uint32_t n = 1; n <= 32; n++) { 7996 for (uint32_t m = 1; m <= 2; m++) { 7997 GemmMicrokernelTester() 7998 .mr(2) 7999 .nr(32) 8000 .kr(4) 8001 .sr(1) 8002 .m(m) 8003 .n(n) 8004 .k(k) 8005 .cm_stride(37) 8006 .iterations(1) 8007 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8008 } 8009 } 8010 } 8011 } 8012 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,qmin)8013 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, qmin) { 8014 TEST_REQUIRES_ARM_NEON_DOT; 8015 GemmMicrokernelTester() 8016 .mr(2) 8017 .nr(32) 8018 
.kr(4) 8019 .sr(1) 8020 .m(2) 8021 .n(32) 8022 .k(8) 8023 .qmin(128) 8024 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8025 } 8026 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,qmax)8027 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, qmax) { 8028 TEST_REQUIRES_ARM_NEON_DOT; 8029 GemmMicrokernelTester() 8030 .mr(2) 8031 .nr(32) 8032 .kr(4) 8033 .sr(1) 8034 .m(2) 8035 .n(32) 8036 .k(8) 8037 .qmax(128) 8038 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8039 } 8040 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,strided_cm)8041 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, strided_cm) { 8042 TEST_REQUIRES_ARM_NEON_DOT; 8043 GemmMicrokernelTester() 8044 .mr(2) 8045 .nr(32) 8046 .kr(4) 8047 .sr(1) 8048 .m(2) 8049 .n(32) 8050 .k(8) 8051 .cm_stride(37) 8052 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8053 } 8054 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,no_a_zero_point)8055 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, no_a_zero_point) { 8056 TEST_REQUIRES_ARM_NEON_DOT; 8057 for (size_t k = 1; k <= 40; k += 9) { 8058 GemmMicrokernelTester() 8059 .mr(2) 8060 .nr(32) 8061 .kr(4) 8062 .sr(1) 8063 .m(2) 8064 .n(32) 8065 .k(k) 8066 .a_zero_point(0) 8067 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8068 } 8069 } 8070 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,no_b_zero_point)8071 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, no_b_zero_point) { 8072 TEST_REQUIRES_ARM_NEON_DOT; 8073 for (size_t k = 1; k <= 40; k += 9) { 8074 GemmMicrokernelTester() 8075 .mr(2) 8076 .nr(32) 8077 .kr(4) 8078 .sr(1) 8079 .m(2) 8080 .n(32) 8081 .k(k) 8082 .b_zero_point(0) 8083 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 8084 } 8085 } 8086 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT,no_zero_point)8087 TEST(QU8_GEMM_MINMAX_RNDNU_2X32C4__NEONDOT, no_zero_point) { 8088 TEST_REQUIRES_ARM_NEON_DOT; 8089 for (size_t k = 1; k <= 40; k += 9) { 8090 GemmMicrokernelTester() 8091 .mr(2) 8092 .nr(32) 8093 .kr(4) 8094 .sr(1) 8095 .m(2) 8096 .n(32) 8097 .k(k) 8098 .a_zero_point(0) 8099 .b_zero_point(0) 8100 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8101 } 8102 } 8103 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 8104 8105 8106 #if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,k_eq_8)8107 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_eq_8) { 8108 TEST_REQUIRES_ARM_NEON_DOT; 8109 GemmMicrokernelTester() 8110 .mr(3) 8111 .nr(32) 8112 .kr(4) 8113 .sr(1) 8114 .m(3) 8115 .n(32) 8116 .k(8) 8117 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8118 } 8119 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,strided_cn)8120 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, strided_cn) { 8121 TEST_REQUIRES_ARM_NEON_DOT; 8122 GemmMicrokernelTester() 8123 .mr(3) 8124 .nr(32) 8125 .kr(4) 8126 .sr(1) 8127 .m(3) 8128 .n(32) 8129 .k(8) 8130 .cn_stride(37) 8131 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8132 } 8133 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,k_eq_8_strided_a)8134 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_eq_8_strided_a) { 8135 TEST_REQUIRES_ARM_NEON_DOT; 8136 GemmMicrokernelTester() 8137 .mr(3) 8138 .nr(32) 8139 .kr(4) 8140 .sr(1) 8141 .m(3) 8142 .n(32) 8143 .k(8) 8144 .a_stride(11) 8145 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8146 } 8147 
// k fixed at 8; sweeps n over [1, 32] and m over [1, 3], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t nn = 1; nn <= 32; ++nn) {
    for (uint32_t mm = 1; mm <= 3; ++mm) {
      GemmMicrokernelTester()
        .mr(3).nr(32).kr(4).sr(1)
        .m(mm).n(nn).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// k fixed at 8, n fixed at 32; sweeps m over [1, 3], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t mm = 1; mm <= 3; ++mm) {
    GemmMicrokernelTester()
      .mr(3).nr(32).kr(4).sr(1)
      .m(mm).n(32).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// k fixed at 8, m fixed at 3; sweeps n over [1, 32], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (uint32_t nn = 1; nn <= 32; ++nn) {
    GemmMicrokernelTester()
      .mr(3).nr(32).kr(4).sr(1)
      .m(3).n(nn).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// m = 3, n = 32; sweeps k over [1, 7].
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_lt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(3).nr(32).kr(4).sr(1)
      .m(3).n(32).k(kk)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
// m = 3, n = 32; sweeps k over [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(3).nr(32).kr(4).sr(1)
      .m(3).n(32).k(kk)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Sweeps k over [1, 7], n over [1, 32] and m over [1, 3], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t kk = 1; kk < 8; ++kk) {
    for (uint32_t nn = 1; nn <= 32; ++nn) {
      for (uint32_t mm = 1; mm <= 3; ++mm) {
        GemmMicrokernelTester()
          .mr(3).nr(32).kr(4).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 3, n = 32; sweeps k over [9, 15].
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_gt_8) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(3).nr(32).kr(4).sr(1)
      .m(3).n(32).k(kk)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// m = 3, n = 32; sweeps k over [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON_DOT;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(3).nr(32).kr(4).sr(1)
      .m(3).n(32).k(kk)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,k_gt_8_subtile)8280 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_gt_8_subtile) { 8281 TEST_REQUIRES_ARM_NEON_DOT; 8282 for (size_t k = 9; k < 16; k++) { 8283 for (uint32_t n = 1; n <= 32; n++) { 8284 for (uint32_t m = 1; m <= 3; m++) { 8285 GemmMicrokernelTester() 8286 .mr(3) 8287 .nr(32) 8288 .kr(4) 8289 .sr(1) 8290 .m(m) 8291 .n(n) 8292 .k(k) 8293 .iterations(1) 8294 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8295 } 8296 } 8297 } 8298 } 8299 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,k_div_8)8300 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_div_8) { 8301 TEST_REQUIRES_ARM_NEON_DOT; 8302 for (size_t k = 16; k <= 80; k += 8) { 8303 GemmMicrokernelTester() 8304 .mr(3) 8305 .nr(32) 8306 .kr(4) 8307 .sr(1) 8308 .m(3) 8309 .n(32) 8310 .k(k) 8311 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8312 } 8313 } 8314 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,k_div_8_strided_a)8315 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_div_8_strided_a) { 8316 TEST_REQUIRES_ARM_NEON_DOT; 8317 for (size_t k = 16; k <= 80; k += 8) { 8318 GemmMicrokernelTester() 8319 .mr(3) 8320 .nr(32) 8321 .kr(4) 8322 .sr(1) 8323 .m(3) 8324 .n(32) 8325 .k(k) 8326 .a_stride(83) 8327 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8328 } 8329 } 8330 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,k_div_8_subtile)8331 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, k_div_8_subtile) { 8332 TEST_REQUIRES_ARM_NEON_DOT; 8333 for (size_t k = 16; k <= 80; k += 8) { 8334 for (uint32_t n = 1; n <= 32; n++) { 8335 for (uint32_t m = 1; m <= 3; m++) { 8336 GemmMicrokernelTester() 8337 .mr(3) 8338 .nr(32) 8339 .kr(4) 8340 .sr(1) 8341 .m(m) 8342 .n(n) 8343 .k(k) 8344 .iterations(1) 8345 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8346 } 8347 } 8348 } 8349 } 8350 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_gt_32)8351 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_gt_32) { 8352 TEST_REQUIRES_ARM_NEON_DOT; 8353 for (uint32_t n = 33; n < 64; n++) { 8354 for (size_t k = 1; k <= 40; k += 9) { 8355 GemmMicrokernelTester() 8356 .mr(3) 8357 .nr(32) 8358 .kr(4) 8359 .sr(1) 8360 .m(3) 8361 .n(n) 8362 .k(k) 8363 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8364 } 8365 } 8366 } 8367 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_gt_32_strided_cn)8368 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_gt_32_strided_cn) { 8369 TEST_REQUIRES_ARM_NEON_DOT; 8370 for (uint32_t n = 33; n < 64; n++) { 8371 for (size_t k = 1; k <= 40; k += 9) { 8372 GemmMicrokernelTester() 8373 .mr(3) 8374 .nr(32) 8375 .kr(4) 8376 .sr(1) 8377 .m(3) 8378 .n(n) 8379 .k(k) 8380 .cn_stride(37) 8381 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8382 } 8383 } 8384 } 8385 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_gt_32_strided_a)8386 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_gt_32_strided_a) { 8387 TEST_REQUIRES_ARM_NEON_DOT; 8388 for (uint32_t n = 33; n < 64; n++) { 8389 for (size_t k = 1; k <= 40; k += 9) { 8390 GemmMicrokernelTester() 8391 .mr(3) 8392 .nr(32) 8393 .kr(4) 8394 .sr(1) 8395 .m(3) 8396 .n(n) 8397 .k(k) 8398 .a_stride(43) 8399 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8400 } 8401 } 8402 } 8403 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_gt_32_subtile)8404 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_gt_32_subtile) { 8405 TEST_REQUIRES_ARM_NEON_DOT; 8406 for (uint32_t n = 33; n < 64; n++) { 8407 for (size_t k = 1; k <= 40; k += 9) { 8408 for (uint32_t m = 1; m <= 3; m++) { 8409 GemmMicrokernelTester() 
8410 .mr(3) 8411 .nr(32) 8412 .kr(4) 8413 .sr(1) 8414 .m(m) 8415 .n(n) 8416 .k(k) 8417 .iterations(1) 8418 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8419 } 8420 } 8421 } 8422 } 8423 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_div_32)8424 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_div_32) { 8425 TEST_REQUIRES_ARM_NEON_DOT; 8426 for (uint32_t n = 64; n <= 96; n += 32) { 8427 for (size_t k = 1; k <= 40; k += 9) { 8428 GemmMicrokernelTester() 8429 .mr(3) 8430 .nr(32) 8431 .kr(4) 8432 .sr(1) 8433 .m(3) 8434 .n(n) 8435 .k(k) 8436 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8437 } 8438 } 8439 } 8440 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_div_32_strided_cn)8441 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_div_32_strided_cn) { 8442 TEST_REQUIRES_ARM_NEON_DOT; 8443 for (uint32_t n = 64; n <= 96; n += 32) { 8444 for (size_t k = 1; k <= 40; k += 9) { 8445 GemmMicrokernelTester() 8446 .mr(3) 8447 .nr(32) 8448 .kr(4) 8449 .sr(1) 8450 .m(3) 8451 .n(n) 8452 .k(k) 8453 .cn_stride(37) 8454 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8455 } 8456 } 8457 } 8458 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_div_32_strided_a)8459 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_div_32_strided_a) { 8460 TEST_REQUIRES_ARM_NEON_DOT; 8461 for (uint32_t n = 64; n <= 96; n += 32) { 8462 for (size_t k = 1; k <= 40; k += 9) { 8463 GemmMicrokernelTester() 8464 .mr(3) 8465 .nr(32) 8466 .kr(4) 8467 .sr(1) 8468 .m(3) 8469 .n(n) 8470 .k(k) 8471 .a_stride(43) 8472 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8473 } 8474 } 8475 } 8476 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,n_div_32_subtile)8477 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, n_div_32_subtile) { 
8478 TEST_REQUIRES_ARM_NEON_DOT; 8479 for (uint32_t n = 64; n <= 96; n += 32) { 8480 for (size_t k = 1; k <= 40; k += 9) { 8481 for (uint32_t m = 1; m <= 3; m++) { 8482 GemmMicrokernelTester() 8483 .mr(3) 8484 .nr(32) 8485 .kr(4) 8486 .sr(1) 8487 .m(m) 8488 .n(n) 8489 .k(k) 8490 .iterations(1) 8491 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8492 } 8493 } 8494 } 8495 } 8496 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,strided_cm_subtile)8497 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, strided_cm_subtile) { 8498 TEST_REQUIRES_ARM_NEON_DOT; 8499 for (size_t k = 1; k <= 40; k += 9) { 8500 for (uint32_t n = 1; n <= 32; n++) { 8501 for (uint32_t m = 1; m <= 3; m++) { 8502 GemmMicrokernelTester() 8503 .mr(3) 8504 .nr(32) 8505 .kr(4) 8506 .sr(1) 8507 .m(m) 8508 .n(n) 8509 .k(k) 8510 .cm_stride(37) 8511 .iterations(1) 8512 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8513 } 8514 } 8515 } 8516 } 8517 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,qmin)8518 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, qmin) { 8519 TEST_REQUIRES_ARM_NEON_DOT; 8520 GemmMicrokernelTester() 8521 .mr(3) 8522 .nr(32) 8523 .kr(4) 8524 .sr(1) 8525 .m(3) 8526 .n(32) 8527 .k(8) 8528 .qmin(128) 8529 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8530 } 8531 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,qmax)8532 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, qmax) { 8533 TEST_REQUIRES_ARM_NEON_DOT; 8534 GemmMicrokernelTester() 8535 .mr(3) 8536 .nr(32) 8537 .kr(4) 8538 .sr(1) 8539 .m(3) 8540 .n(32) 8541 .k(8) 8542 .qmax(128) 8543 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8544 } 8545 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,strided_cm)8546 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, 
strided_cm) { 8547 TEST_REQUIRES_ARM_NEON_DOT; 8548 GemmMicrokernelTester() 8549 .mr(3) 8550 .nr(32) 8551 .kr(4) 8552 .sr(1) 8553 .m(3) 8554 .n(32) 8555 .k(8) 8556 .cm_stride(37) 8557 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8558 } 8559 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,no_a_zero_point)8560 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, no_a_zero_point) { 8561 TEST_REQUIRES_ARM_NEON_DOT; 8562 for (size_t k = 1; k <= 40; k += 9) { 8563 GemmMicrokernelTester() 8564 .mr(3) 8565 .nr(32) 8566 .kr(4) 8567 .sr(1) 8568 .m(3) 8569 .n(32) 8570 .k(k) 8571 .a_zero_point(0) 8572 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8573 } 8574 } 8575 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,no_b_zero_point)8576 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, no_b_zero_point) { 8577 TEST_REQUIRES_ARM_NEON_DOT; 8578 for (size_t k = 1; k <= 40; k += 9) { 8579 GemmMicrokernelTester() 8580 .mr(3) 8581 .nr(32) 8582 .kr(4) 8583 .sr(1) 8584 .m(3) 8585 .n(32) 8586 .k(k) 8587 .b_zero_point(0) 8588 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8589 } 8590 } 8591 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT,no_zero_point)8592 TEST(QU8_GEMM_MINMAX_RNDNU_3X32C4__NEONDOT, no_zero_point) { 8593 TEST_REQUIRES_ARM_NEON_DOT; 8594 for (size_t k = 1; k <= 40; k += 9) { 8595 GemmMicrokernelTester() 8596 .mr(3) 8597 .nr(32) 8598 .kr(4) 8599 .sr(1) 8600 .m(3) 8601 .n(32) 8602 .k(k) 8603 .a_zero_point(0) 8604 .b_zero_point(0) 8605 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 8606 } 8607 } 8608 #endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64 8609 8610 8611 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 
// m = 4, n = 16, k = 8.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_eq_8) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// m = 4, n = 16, k = 8 with cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .cn_stride(19)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// m = 4, n = 16, k = 8 with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_eq_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .a_stride(11)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
          xnn_init_qu8_conv_minmax_rndnu_neon_params,
          xnn_qu8_requantize_rndnu);
}

// k fixed at 8; sweeps n over [1, 16] and m over [1, 4], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nn = 1; nn <= 16; ++nn) {
    for (uint32_t mm = 1; mm <= 4; ++mm) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(mm).n(nn).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
              xnn_init_qu8_conv_minmax_rndnu_neon_params,
              xnn_qu8_requantize_rndnu);
    }
  }
}

// k fixed at 8, n fixed at 16; sweeps m over [1, 4], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t mm = 1; mm <= 4; ++mm) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(mm).n(16).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// k fixed at 8, m fixed at 4; sweeps n over [1, 16], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t nn = 1; nn <= 16; ++nn) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(nn).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16; sweeps k over [1, 7].
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kk)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
// m = 4, n = 16; sweeps k over [1, 7] with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 1; kk < 8; ++kk) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kk)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Sweeps k over [1, 7], n over [1, 16] and m over [1, 4], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 1; kk < 8; ++kk) {
    for (uint32_t nn = 1; nn <= 16; ++nn) {
      for (uint32_t mm = 1; mm <= 4; ++mm) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16; sweeps k over [9, 15].
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kk)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16; sweeps k over [9, 15] with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 9; kk < 16; ++kk) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kk)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// Sweeps k over [9, 15], n over [1, 16] and m over [1, 4], one iteration per configuration.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 9; kk < 16; ++kk) {
    for (uint32_t nn = 1; nn <= 16; ++nn) {
      for (uint32_t mm = 1; mm <= 4; ++mm) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(mm).n(nn).k(kk)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
                xnn_init_qu8_conv_minmax_rndnu_neon_params,
                xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16; k takes the multiples of 8 from 16 through 80.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kk)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16; k takes the multiples of 8 from 16 through 80 with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t kk = 16; kk <= 80; kk += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(kk)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
            xnn_init_qu8_conv_minmax_rndnu_neon_params,
            xnn_qu8_requantize_rndnu);
  }
}
// Every m in [1, 4] and n in [1, 16] for k = 16, 24, ..., 80; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 17..31, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_gt_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4, n = 17..31, k = 1, 10, ..., 37, cn_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}
// m = 4, n = 17..31, k = 1, 10, ..., 37, a_stride = 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Every m in [1, 4] for n = 17..31 and k = 1, 10, ..., 37; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 32 and 48, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_div_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4, n = 32 and 48, k = 1, 10, ..., 37, cn_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4, n = 32 and 48, k = 1, 10, ..., 37, a_stride = 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// Every m in [1, 4] for n = 32 and 48 and k = 1, 10, ..., 37; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Every m in [1, 4] and n in [1, 16] for k = 1, 10, ..., 37 with cm_stride = 19; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x16 tile, k = 8, qmin = 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 4x16 tile, k = 8, qmax = 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// Full 4x16 tile, k = 8, cm_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .cm_stride(19)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75,no_a_zero_point)9065 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, no_a_zero_point) { 9066 TEST_REQUIRES_ARM_NEON; 9067 for (size_t k = 1; k <= 40; k += 9) { 9068 GemmMicrokernelTester() 9069 .mr(4) 9070 .nr(16) 9071 .kr(1) 9072 .sr(1) 9073 .m(4) 9074 .n(16) 9075 .k(k) 9076 .a_zero_point(0) 9077 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9078 } 9079 } 9080 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75,no_b_zero_point)9081 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, no_b_zero_point) { 9082 TEST_REQUIRES_ARM_NEON; 9083 for (size_t k = 1; k <= 40; k += 9) { 9084 GemmMicrokernelTester() 9085 .mr(4) 9086 .nr(16) 9087 .kr(1) 9088 .sr(1) 9089 .m(4) 9090 .n(16) 9091 .k(k) 9092 .b_zero_point(0) 9093 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9094 } 9095 } 9096 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75,no_zero_point)9097 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A75, no_zero_point) { 9098 TEST_REQUIRES_ARM_NEON; 9099 for (size_t k = 1; k <= 40; k += 9) { 9100 GemmMicrokernelTester() 9101 .mr(4) 9102 .nr(16) 9103 .kr(1) 9104 .sr(1) 9105 .m(4) 9106 .n(16) 9107 .k(k) 9108 .a_zero_point(0) 9109 .b_zero_point(0) 9110 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9111 } 9112 } 9113 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 9114 9115 9116 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,k_eq_8)9117 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8) { 
9118 TEST_REQUIRES_ARM_NEON; 9119 GemmMicrokernelTester() 9120 .mr(4) 9121 .nr(16) 9122 .kr(1) 9123 .sr(1) 9124 .m(4) 9125 .n(16) 9126 .k(8) 9127 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9128 } 9129 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,strided_cn)9130 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cn) { 9131 TEST_REQUIRES_ARM_NEON; 9132 GemmMicrokernelTester() 9133 .mr(4) 9134 .nr(16) 9135 .kr(1) 9136 .sr(1) 9137 .m(4) 9138 .n(16) 9139 .k(8) 9140 .cn_stride(19) 9141 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9142 } 9143 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,k_eq_8_strided_a)9144 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_strided_a) { 9145 TEST_REQUIRES_ARM_NEON; 9146 GemmMicrokernelTester() 9147 .mr(4) 9148 .nr(16) 9149 .kr(1) 9150 .sr(1) 9151 .m(4) 9152 .n(16) 9153 .k(8) 9154 .a_stride(11) 9155 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9156 } 9157 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,k_eq_8_subtile)9158 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile) { 9159 TEST_REQUIRES_ARM_NEON; 9160 for (uint32_t n = 1; n <= 16; n++) { 9161 for (uint32_t m = 1; m <= 4; m++) { 9162 GemmMicrokernelTester() 9163 .mr(4) 9164 .nr(16) 9165 .kr(1) 9166 .sr(1) 9167 .m(m) 9168 .n(n) 9169 .k(8) 9170 .iterations(1) 9171 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9172 } 9173 } 9174 } 9175 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,k_eq_8_subtile_m)9176 
// m = 1..4 with n = 16 and k = 8; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 4; m++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(m).n(16).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// n = 1..16 with m = 4 and k = 8; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 16; n++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(n).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile, k = 1..7.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile, k = 1..7, a_stride = 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every m in [1, 4] and n in [1, 16] for k = 1..7; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x16 tile, k = 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile, k = 9..15, a_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every m in [1, 4] and n in [1, 16] for k = 9..15; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x16 tile, k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile, k = 16, 24, ..., 80, a_stride = 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every m in [1, 4] and n in [1, 16] for k = 16, 24, ..., 80; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 17..31, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4, n = 17..31, k = 1, 10, ..., 37, cn_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4, n = 17..31, k = 1, 10, ..., 37, a_stride = 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,n_gt_16_subtile)9414 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_subtile) { 9415 TEST_REQUIRES_ARM_NEON; 9416 for (uint32_t n = 17; n < 32; n++) { 9417 for (size_t k = 1; k <= 40; k += 9) { 9418 for (uint32_t m = 1; m <= 4; m++) { 9419 GemmMicrokernelTester() 9420 .mr(4) 9421 .nr(16) 9422 .kr(1) 9423 .sr(1) 9424 .m(m) 9425 .n(n) 9426 .k(k) 9427 .iterations(1) 9428 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9429 } 9430 } 9431 } 9432 } 9433 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,n_div_16)9434 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16) { 9435 TEST_REQUIRES_ARM_NEON; 9436 for (uint32_t n = 32; n <= 48; n += 16) { 9437 for (size_t k = 1; k <= 40; k += 9) { 9438 GemmMicrokernelTester() 9439 .mr(4) 9440 .nr(16) 9441 .kr(1) 9442 .sr(1) 9443 .m(4) 9444 .n(n) 9445 .k(k) 9446 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9447 } 9448 } 9449 } 9450 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,n_div_16_strided_cn)9451 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_strided_cn) { 9452 TEST_REQUIRES_ARM_NEON; 9453 for (uint32_t n = 32; n <= 48; n += 16) { 9454 for (size_t k = 1; k <= 40; k += 9) { 9455 GemmMicrokernelTester() 9456 .mr(4) 9457 .nr(16) 9458 .kr(1) 9459 .sr(1) 9460 .m(4) 9461 .n(n) 9462 .k(k) 9463 .cn_stride(19) 9464 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9465 } 9466 } 9467 } 9468 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,n_div_16_strided_a)9469 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, 
n_div_16_strided_a) { 9470 TEST_REQUIRES_ARM_NEON; 9471 for (uint32_t n = 32; n <= 48; n += 16) { 9472 for (size_t k = 1; k <= 40; k += 9) { 9473 GemmMicrokernelTester() 9474 .mr(4) 9475 .nr(16) 9476 .kr(1) 9477 .sr(1) 9478 .m(4) 9479 .n(n) 9480 .k(k) 9481 .a_stride(43) 9482 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9483 } 9484 } 9485 } 9486 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,n_div_16_subtile)9487 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_subtile) { 9488 TEST_REQUIRES_ARM_NEON; 9489 for (uint32_t n = 32; n <= 48; n += 16) { 9490 for (size_t k = 1; k <= 40; k += 9) { 9491 for (uint32_t m = 1; m <= 4; m++) { 9492 GemmMicrokernelTester() 9493 .mr(4) 9494 .nr(16) 9495 .kr(1) 9496 .sr(1) 9497 .m(m) 9498 .n(n) 9499 .k(k) 9500 .iterations(1) 9501 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9502 } 9503 } 9504 } 9505 } 9506 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,strided_cm_subtile)9507 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cm_subtile) { 9508 TEST_REQUIRES_ARM_NEON; 9509 for (size_t k = 1; k <= 40; k += 9) { 9510 for (uint32_t n = 1; n <= 16; n++) { 9511 for (uint32_t m = 1; m <= 4; m++) { 9512 GemmMicrokernelTester() 9513 .mr(4) 9514 .nr(16) 9515 .kr(1) 9516 .sr(1) 9517 .m(m) 9518 .n(n) 9519 .k(k) 9520 .cm_stride(19) 9521 .iterations(1) 9522 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9523 } 9524 } 9525 } 9526 } 9527 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,qmin)9528 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, qmin) { 9529 TEST_REQUIRES_ARM_NEON; 9530 GemmMicrokernelTester() 
9531 .mr(4) 9532 .nr(16) 9533 .kr(1) 9534 .sr(1) 9535 .m(4) 9536 .n(16) 9537 .k(8) 9538 .qmin(128) 9539 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9540 } 9541 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,qmax)9542 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, qmax) { 9543 TEST_REQUIRES_ARM_NEON; 9544 GemmMicrokernelTester() 9545 .mr(4) 9546 .nr(16) 9547 .kr(1) 9548 .sr(1) 9549 .m(4) 9550 .n(16) 9551 .k(8) 9552 .qmax(128) 9553 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9554 } 9555 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,strided_cm)9556 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cm) { 9557 TEST_REQUIRES_ARM_NEON; 9558 GemmMicrokernelTester() 9559 .mr(4) 9560 .nr(16) 9561 .kr(1) 9562 .sr(1) 9563 .m(4) 9564 .n(16) 9565 .k(8) 9566 .cm_stride(19) 9567 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9568 } 9569 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,no_a_zero_point)9570 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, no_a_zero_point) { 9571 TEST_REQUIRES_ARM_NEON; 9572 for (size_t k = 1; k <= 40; k += 9) { 9573 GemmMicrokernelTester() 9574 .mr(4) 9575 .nr(16) 9576 .kr(1) 9577 .sr(1) 9578 .m(4) 9579 .n(16) 9580 .k(k) 9581 .a_zero_point(0) 9582 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9583 } 9584 } 9585 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,no_b_zero_point)9586 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, no_b_zero_point) { 9587 
TEST_REQUIRES_ARM_NEON; 9588 for (size_t k = 1; k <= 40; k += 9) { 9589 GemmMicrokernelTester() 9590 .mr(4) 9591 .nr(16) 9592 .kr(1) 9593 .sr(1) 9594 .m(4) 9595 .n(16) 9596 .k(k) 9597 .b_zero_point(0) 9598 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9599 } 9600 } 9601 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53,no_zero_point)9602 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, no_zero_point) { 9603 TEST_REQUIRES_ARM_NEON; 9604 for (size_t k = 1; k <= 40; k += 9) { 9605 GemmMicrokernelTester() 9606 .mr(4) 9607 .nr(16) 9608 .kr(1) 9609 .sr(1) 9610 .m(4) 9611 .n(16) 9612 .k(k) 9613 .a_zero_point(0) 9614 .b_zero_point(0) 9615 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9616 } 9617 } 9618 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 9619 9620 9621 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_eq_8)9622 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) { 9623 TEST_REQUIRES_ARM_NEON; 9624 GemmMicrokernelTester() 9625 .mr(4) 9626 .nr(16) 9627 .kr(1) 9628 .sr(1) 9629 .m(4) 9630 .n(16) 9631 .k(8) 9632 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9633 } 9634 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,strided_cn)9635 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) { 9636 TEST_REQUIRES_ARM_NEON; 9637 GemmMicrokernelTester() 9638 .mr(4) 9639 .nr(16) 9640 .kr(1) 9641 .sr(1) 9642 .m(4) 9643 .n(16) 9644 .k(8) 9645 .cn_stride(19) 9646 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, 
xnn_qu8_requantize_rndnu); 9647 } 9648 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_strided_a)9649 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) { 9650 TEST_REQUIRES_ARM_NEON; 9651 GemmMicrokernelTester() 9652 .mr(4) 9653 .nr(16) 9654 .kr(1) 9655 .sr(1) 9656 .m(4) 9657 .n(16) 9658 .k(8) 9659 .a_stride(11) 9660 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9661 } 9662 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_subtile)9663 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) { 9664 TEST_REQUIRES_ARM_NEON; 9665 for (uint32_t n = 1; n <= 16; n++) { 9666 for (uint32_t m = 1; m <= 4; m++) { 9667 GemmMicrokernelTester() 9668 .mr(4) 9669 .nr(16) 9670 .kr(1) 9671 .sr(1) 9672 .m(m) 9673 .n(n) 9674 .k(8) 9675 .iterations(1) 9676 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9677 } 9678 } 9679 } 9680 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_subtile_m)9681 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) { 9682 TEST_REQUIRES_ARM_NEON; 9683 for (uint32_t m = 1; m <= 4; m++) { 9684 GemmMicrokernelTester() 9685 .mr(4) 9686 .nr(16) 9687 .kr(1) 9688 .sr(1) 9689 .m(m) 9690 .n(16) 9691 .k(8) 9692 .iterations(1) 9693 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9694 } 9695 } 9696 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_eq_8_subtile_n)9697 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) { 9698 TEST_REQUIRES_ARM_NEON; 9699 for (uint32_t n = 1; n <= 16; n++) { 9700 GemmMicrokernelTester() 9701 .mr(4) 9702 .nr(16) 9703 
.kr(1) 9704 .sr(1) 9705 .m(4) 9706 .n(n) 9707 .k(8) 9708 .iterations(1) 9709 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9710 } 9711 } 9712 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_lt_8)9713 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) { 9714 TEST_REQUIRES_ARM_NEON; 9715 for (size_t k = 1; k < 8; k++) { 9716 GemmMicrokernelTester() 9717 .mr(4) 9718 .nr(16) 9719 .kr(1) 9720 .sr(1) 9721 .m(4) 9722 .n(16) 9723 .k(k) 9724 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9725 } 9726 } 9727 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_lt_8_strided_a)9728 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) { 9729 TEST_REQUIRES_ARM_NEON; 9730 for (size_t k = 1; k < 8; k++) { 9731 GemmMicrokernelTester() 9732 .mr(4) 9733 .nr(16) 9734 .kr(1) 9735 .sr(1) 9736 .m(4) 9737 .n(16) 9738 .k(k) 9739 .a_stride(11) 9740 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9741 } 9742 } 9743 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,k_lt_8_subtile)9744 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) { 9745 TEST_REQUIRES_ARM_NEON; 9746 for (size_t k = 1; k < 8; k++) { 9747 for (uint32_t n = 1; n <= 16; n++) { 9748 for (uint32_t m = 1; m <= 4; m++) { 9749 GemmMicrokernelTester() 9750 .mr(4) 9751 .nr(16) 9752 .kr(1) 9753 .sr(1) 9754 .m(m) 9755 .n(n) 9756 .k(k) 9757 .iterations(1) 9758 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9759 } 9760 } 9761 } 9762 } 9763 
// Full 4x16 tile, k = 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile, k = 9..15, a_stride = 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every m in [1, 4] and n in [1, 16] for k = 9..15; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; k++) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// Full 4x16 tile, k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Full 4x16 tile, k = 16, 24, ..., 80, a_stride = 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every m in [1, 4] and n in [1, 16] for k = 16, 24, ..., 80; one iteration per shape.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; n++) {
      for (uint32_t m = 1; m <= 4; m++) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 17..31, k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; n++) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}
// m = 4 with n in 17..31 and k = 1, 10, ..., 37; cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4 with n in 17..31 and k = 1, 10, ..., 37; a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in 17..31, k = 1, 10, ..., 37, and m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) { 9940 TEST_REQUIRES_ARM_NEON; 9941 for (uint32_t n = 32; n <= 48; n += 16) { 9942 for (size_t k = 1; k <= 40; k += 9) { 9943 GemmMicrokernelTester() 9944 .mr(4) 9945 .nr(16) 9946 .kr(1) 9947 .sr(1) 9948 .m(4) 9949 .n(n) 9950 .k(k) 9951 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9952 } 9953 } 9954 } 9955 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,n_div_16_strided_cn)9956 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) { 9957 TEST_REQUIRES_ARM_NEON; 9958 for (uint32_t n = 32; n <= 48; n += 16) { 9959 for (size_t k = 1; k <= 40; k += 9) { 9960 GemmMicrokernelTester() 9961 .mr(4) 9962 .nr(16) 9963 .kr(1) 9964 .sr(1) 9965 .m(4) 9966 .n(n) 9967 .k(k) 9968 .cn_stride(19) 9969 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9970 } 9971 } 9972 } 9973 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,n_div_16_strided_a)9974 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) { 9975 TEST_REQUIRES_ARM_NEON; 9976 for (uint32_t n = 32; n <= 48; n += 16) { 9977 for (size_t k = 1; k <= 40; k += 9) { 9978 GemmMicrokernelTester() 9979 .mr(4) 9980 .nr(16) 9981 .kr(1) 9982 .sr(1) 9983 .m(4) 9984 .n(n) 9985 .k(k) 9986 .a_stride(43) 9987 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 9988 } 9989 } 9990 } 9991 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,n_div_16_subtile)9992 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) { 9993 TEST_REQUIRES_ARM_NEON; 9994 for (uint32_t n = 32; n <= 48; n += 16) { 9995 for (size_t k = 1; k <= 40; k 
+= 9) { 9996 for (uint32_t m = 1; m <= 4; m++) { 9997 GemmMicrokernelTester() 9998 .mr(4) 9999 .nr(16) 10000 .kr(1) 10001 .sr(1) 10002 .m(m) 10003 .n(n) 10004 .k(k) 10005 .iterations(1) 10006 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10007 } 10008 } 10009 } 10010 } 10011 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,strided_cm_subtile)10012 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) { 10013 TEST_REQUIRES_ARM_NEON; 10014 for (size_t k = 1; k <= 40; k += 9) { 10015 for (uint32_t n = 1; n <= 16; n++) { 10016 for (uint32_t m = 1; m <= 4; m++) { 10017 GemmMicrokernelTester() 10018 .mr(4) 10019 .nr(16) 10020 .kr(1) 10021 .sr(1) 10022 .m(m) 10023 .n(n) 10024 .k(k) 10025 .cm_stride(19) 10026 .iterations(1) 10027 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10028 } 10029 } 10030 } 10031 } 10032 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,qmin)10033 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) { 10034 TEST_REQUIRES_ARM_NEON; 10035 GemmMicrokernelTester() 10036 .mr(4) 10037 .nr(16) 10038 .kr(1) 10039 .sr(1) 10040 .m(4) 10041 .n(16) 10042 .k(8) 10043 .qmin(128) 10044 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10045 } 10046 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,qmax)10047 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) { 10048 TEST_REQUIRES_ARM_NEON; 10049 GemmMicrokernelTester() 10050 .mr(4) 10051 .nr(16) 10052 .kr(1) 10053 .sr(1) 10054 .m(4) 10055 .n(16) 10056 .k(8) 10057 .qmax(128) 10058 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 
xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10059 } 10060 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,strided_cm)10061 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) { 10062 TEST_REQUIRES_ARM_NEON; 10063 GemmMicrokernelTester() 10064 .mr(4) 10065 .nr(16) 10066 .kr(1) 10067 .sr(1) 10068 .m(4) 10069 .n(16) 10070 .k(8) 10071 .cm_stride(19) 10072 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10073 } 10074 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,no_a_zero_point)10075 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, no_a_zero_point) { 10076 TEST_REQUIRES_ARM_NEON; 10077 for (size_t k = 1; k <= 40; k += 9) { 10078 GemmMicrokernelTester() 10079 .mr(4) 10080 .nr(16) 10081 .kr(1) 10082 .sr(1) 10083 .m(4) 10084 .n(16) 10085 .k(k) 10086 .a_zero_point(0) 10087 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10088 } 10089 } 10090 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,no_b_zero_point)10091 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, no_b_zero_point) { 10092 TEST_REQUIRES_ARM_NEON; 10093 for (size_t k = 1; k <= 40; k += 9) { 10094 GemmMicrokernelTester() 10095 .mr(4) 10096 .nr(16) 10097 .kr(1) 10098 .sr(1) 10099 .m(4) 10100 .n(16) 10101 .k(k) 10102 .b_zero_point(0) 10103 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10104 } 10105 } 10106 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64,no_zero_point)10107 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, no_zero_point) { 10108 TEST_REQUIRES_ARM_NEON; 10109 for (size_t k = 1; k <= 40; k += 9) { 10110 
GemmMicrokernelTester() 10111 .mr(4) 10112 .nr(16) 10113 .kr(1) 10114 .sr(1) 10115 .m(4) 10116 .n(16) 10117 .k(k) 10118 .a_zero_point(0) 10119 .b_zero_point(0) 10120 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10121 } 10122 } 10123 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 10124 10125 10126 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,k_eq_8)10127 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) { 10128 TEST_REQUIRES_ARM_NEON; 10129 GemmMicrokernelTester() 10130 .mr(4) 10131 .nr(16) 10132 .kr(1) 10133 .sr(1) 10134 .m(4) 10135 .n(16) 10136 .k(8) 10137 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10138 } 10139 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,strided_cn)10140 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) { 10141 TEST_REQUIRES_ARM_NEON; 10142 GemmMicrokernelTester() 10143 .mr(4) 10144 .nr(16) 10145 .kr(1) 10146 .sr(1) 10147 .m(4) 10148 .n(16) 10149 .k(8) 10150 .cn_stride(19) 10151 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10152 } 10153 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,k_eq_8_strided_a)10154 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) { 10155 TEST_REQUIRES_ARM_NEON; 10156 GemmMicrokernelTester() 10157 .mr(4) 10158 .nr(16) 10159 .kr(1) 10160 .sr(1) 10161 .m(4) 10162 .n(16) 10163 .k(8) 10164 .a_stride(11) 10165 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10166 } 10167 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,k_eq_8_subtile)10168 
// k = 8 with every (m, n), m in 1..4 and n in 1..16; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 16; ++n) {
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(m).n(n).k(8)
        .iterations(1)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// k = 8, n = 16, with m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t m = 1; m <= 4; ++m) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(m).n(16).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// k = 8, m = 4, with n in 1..16; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 1; n <= 16; ++n) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(n).k(8)
      .iterations(1)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16, with k swept over 1..7.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Same k sweep (1..7) with a_stride set to 11.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; ++k) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(11)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every (m, n) with m in 1..4 and n in 1..16, for k in 1..7; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k < 8; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16, with k swept over 9..15.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// m = 4, n = 16, k swept over 9..15 with a_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; ++k) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(19)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every (m, n) with m in 1..4 and n in 1..16, for k in 9..15; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 9; k < 16; ++k) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16, with k = 16, 24, ..., 80.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Same k sweep (16..80 in steps of 8) with a_stride set to 83.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    GemmMicrokernelTester()
      .mr(4).nr(16).kr(1).sr(1)
      .m(4).n(16).k(k)
      .a_stride(83)
      .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
  }
}

// Every (m, n) with m in 1..4 and n in 1..16, for k = 16, 24, ..., 80; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 16; k <= 80; k += 8) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4 with n in 17..31 and k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4 with n in 17..31 and k = 1, 10, ..., 37; cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4 with n in 17..31 and k = 1, 10, ..., 37; a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n in 17..31, k = 1, 10, ..., 37, and m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 17; n < 32; ++n) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4 with n = 32, 48 and k = 1, 10, ..., 37.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4 with n = 32, 48 and k = 1, 10, ..., 37; cn_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .cn_stride(19)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// m = 4 with n = 32, 48 and k = 1, 10, ..., 37; a_stride set to 43.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      GemmMicrokernelTester()
        .mr(4).nr(16).kr(1).sr(1)
        .m(4).n(n).k(k)
        .a_stride(43)
        .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
    }
  }
}

// n = 32, 48, k = 1, 10, ..., 37, and m in 1..4; one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (uint32_t n = 32; n <= 48; n += 16) {
    for (size_t k = 1; k <= 40; k += 9) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// k = 1, 10, ..., 37 with every (m, n), m in 1..4 and n in 1..16; cm_stride set to 19, one iteration per case.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
  TEST_REQUIRES_ARM_NEON;
  for (size_t k = 1; k <= 40; k += 9) {
    for (uint32_t n = 1; n <= 16; ++n) {
      for (uint32_t m = 1; m <= 4; ++m) {
        GemmMicrokernelTester()
          .mr(4).nr(16).kr(1).sr(1)
          .m(m).n(n).k(k)
          .cm_stride(19)
          .iterations(1)
          .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
      }
    }
  }
}

// m = 4, n = 16, k = 8 with qmin set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .qmin(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// m = 4, n = 16, k = 8 with qmax set to 128.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .qmax(128)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

// m = 4, n = 16, k = 8 with cm_stride set to 19.
TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
  TEST_REQUIRES_ARM_NEON;
  GemmMicrokernelTester()
    .mr(4).nr(16).kr(1).sr(1)
    .m(4).n(16).k(8)
    .cm_stride(19)
    .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu);
}

TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,no_a_zero_point)10580 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, no_a_zero_point) { 10581 TEST_REQUIRES_ARM_NEON; 10582 for (size_t k = 1; k <= 40; k += 9) { 10583 GemmMicrokernelTester() 10584 .mr(4) 10585 .nr(16) 10586 .kr(1) 10587 .sr(1) 10588 .m(4) 10589 .n(16) 10590 .k(k) 10591 .a_zero_point(0) 10592 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10593 } 10594 } 10595 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,no_b_zero_point)10596 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, no_b_zero_point) { 10597 TEST_REQUIRES_ARM_NEON; 10598 for (size_t k = 1; k <= 40; k += 9) { 10599 GemmMicrokernelTester() 10600 .mr(4) 10601 .nr(16) 10602 .kr(1) 10603 .sr(1) 10604 .m(4) 10605 .n(16) 10606 .k(k) 10607 .b_zero_point(0) 10608 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10609 } 10610 } 10611 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64,no_zero_point)10612 TEST(QU8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, no_zero_point) { 10613 TEST_REQUIRES_ARM_NEON; 10614 for (size_t k = 1; k <= 40; k += 9) { 10615 GemmMicrokernelTester() 10616 .mr(4) 10617 .nr(16) 10618 .kr(1) 10619 .sr(1) 10620 .m(4) 10621 .n(16) 10622 .k(k) 10623 .a_zero_point(0) 10624 .b_zero_point(0) 10625 .Test(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qu8_conv_minmax_rndnu_neon_params, xnn_qu8_requantize_rndnu); 10626 } 10627 } 10628 #endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY 10629