1 // Copyright 2022 Google LLC 2 // 3 // This source code is licensed under the BSD-style license found in the 4 // LICENSE file in the root directory of this source tree. 5 6 #pragma once 7 8 #include <stddef.h> 9 #include <stdint.h> 10 11 #include <xnnpack/common.h> 12 13 14 // Default: serves to differentiate pointer types for micro-kernels without fused activation. 15 16 union xnn_f16_default_params { 17 char _; // Dummy member variable to comply with the C standard 18 }; 19 20 union xnn_f32_default_params { 21 char _; // Dummy member variable to comply with the C standard 22 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 23 struct { 24 int32_t mask_table[14]; 25 } avx; 26 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 27 }; 28 29 30 // ReLU: serves to differentiate pointer types for micro-kernels with fused ReLU activation. 31 32 union xnn_f32_relu_params { 33 char _; // Dummy member variable to comply with the C standard 34 }; 35 36 37 // Scale+Min+Max: used by AVGPOOL/GAVGPOOL microkernels. 38 39 union xnn_f16_scaleminmax_params { 40 char _; // Dummy member variable to comply with the C standard 41 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 42 struct { 43 uint16_t scale; 44 uint16_t min; 45 uint16_t max; 46 } neon; 47 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 48 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 49 struct { 50 XNN_ALIGN(32) float scale[8]; 51 XNN_ALIGN(32) float min[8]; 52 XNN_ALIGN(32) float max[8]; 53 } avx; 54 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 55 }; 56 57 union xnn_f32_scaleminmax_params { 58 struct { 59 float scale; 60 float min; 61 float max; 62 } scalar; 63 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 64 struct { 65 XNN_ALIGN(16) float scale[4]; 66 XNN_ALIGN(16) float min[4]; 67 XNN_ALIGN(16) float max[4]; 68 } sse; 69 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 70 }; 71 72 73 // Min+Max: used by VCLAMP and GEMM/IGEMM/DWCONV/MAXPOOL/etc with MINMAX activation. 

// BF16 min/max parameters: only a scalar variant, with float-typed bounds.
union xnn_bf16_minmax_params {
  struct {
    float min;
    float max;
  } scalar;
};

// F16 min/max parameters.
// - neon (ARM/ARM64 only): one uint16_t per bound.
// - avx (x86/x86-64 only): 8-element float arrays declared with XNN_ALIGN(32).
union xnn_f16_minmax_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint16_t min;
    uint16_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(32) float min[8];
    XNN_ALIGN(32) float max[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// F32 min/max parameters.
// - scalar (always available): one float per bound.
// - sse / avx (x86/x86-64 only): 4- and 8-element float arrays; avx also
//   carries a 14-entry int32_t mask_table.
// - wasmsimd (WAsm SIMD builds only): 2-element float arrays.
union xnn_f32_minmax_params {
  struct {
    float min;
    float max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float min[4];
    XNN_ALIGN(16) float max[4];
  } sse;
  struct {
    XNN_ALIGN(32) float min[8];
    XNN_ALIGN(32) float max[8];
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float min[2];
    XNN_ALIGN(8) float max[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// S8 min/max parameters.
// - scalar (always available): bounds widened to int32_t.
// - sse2 (x86/x86-64 only): uint8_t arrays named bias / min_with_bias /
//   max_with_bias; sse4 stores int8_t min / max arrays directly.
// - neon (ARM/ARM64 only): one int8_t per bound.
// - wasmsimd (WAsm SIMD builds only): 8-element int8_t arrays.
union xnn_s8_minmax_params {
  struct {
    int32_t min;
    int32_t max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) uint8_t bias[16];
    XNN_ALIGN(16) uint8_t min_with_bias[16];
    XNN_ALIGN(16) uint8_t max_with_bias[16];
  } sse2;
  struct {
    XNN_ALIGN(16) int8_t min[16];
    XNN_ALIGN(16) int8_t max[16];
  } sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    int8_t min;
    int8_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int8_t min[8];
    XNN_ALIGN(8) int8_t max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// U8 min/max parameters.
// - scalar (always available): bounds widened to uint32_t.
// - sse2 (x86/x86-64 only): 16-element uint8_t arrays.
// - neon (ARM/ARM64 only): one uint8_t per bound.
// - wasmsimd (WAsm SIMD builds only): 8-element uint8_t arrays.
union xnn_u8_minmax_params {
  struct {
    uint32_t min;
    uint32_t max;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) uint8_t min[16];
    XNN_ALIGN(16) uint8_t max[16];
  } sse2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint8_t min;
    uint8_t max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) uint8_t min[8];
    XNN_ALIGN(8) uint8_t max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Conv w. Min+Max: used by quantized GEMM/IGEMM/DWCONV microkernels with MINMAX activation.

// QC8 convolution parameters. Unlike the QS8/QU8 unions that follow, no
// variant here carries a `scale` field.
// Three scalar variants are always available (imagic / fmagic / lrintf);
// the remaining variants are gated on the target architecture.
union xnn_qc8_conv_minmax_params {
  struct {
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  struct {
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  struct {
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM
  struct {
    float magic_bias;
    int32_t magic_bias_less_zero_point;
    uint32_t output_min;
    uint32_t output_max;
  } fp32_armsimd32;
#endif  // XNN_ARCH_ARM
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  // NOTE(review): output_min/output_max are uint8_t here but int8_t in the
  // sibling fp32_neon variant; kept exactly as found.
  struct {
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } fp32_sse2;
  struct {
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } fp32_sse4;
  struct {
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) int8_t output_min[32];
  } fp32_avx2;
  struct {
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) int8_t output_min[64];
  } fp32_avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// QS8 convolution parameters. Every fp32_* variant starts with a `scale`
// field; the ARM-only rndnu_neon variant uses integer shift/multiplier
// fields instead.
union xnn_qs8_conv_minmax_params {
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  struct {
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  struct {
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM
  struct {
    float scale;
    float magic_bias;
    int32_t magic_bias_less_zero_point;
    uint32_t output_min;
    uint32_t output_max;
  } fp32_armsimd32;
#endif  // XNN_ARCH_ARM
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  struct {
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neonv8;
  struct {
    int32_t right_pre_shift;
    int32_t multiplier;
    int32_t right_post_shift;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } fp32_sse2;
  struct {
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } fp32_sse4;
  struct {
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) int8_t output_min[32];
  } fp32_avx2;
  struct {
    XNN_ALIGN(64) float scale[16];
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) int8_t output_min[64];
  } fp32_avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// QU8 convolution parameters. Same family of variants as the QS8 union, with
// an additional kernel zero point field in every variant (named
// minus_kernel_zero_point in fp32_armsimd32) and unsigned output bounds in
// the NEON/x86 variants.
union xnn_qu8_conv_minmax_params {
  struct {
    int32_t kernel_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  struct {
    int32_t kernel_zero_point;
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  struct {
    int32_t kernel_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM
  struct {
    float scale;
    float magic_bias;
    uint32_t minus_kernel_zero_point;
    int32_t magic_bias_less_zero_point;
    uint32_t output_min;
    uint32_t output_max;
  } fp32_armsimd32;
#endif  // XNN_ARCH_ARM
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint8_t kernel_zero_point[4];
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neon;
  struct {
    uint8_t kernel_zero_point[4];
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
  struct {
    uint8_t kernel_zero_point[4];
    int32_t right_pre_shift;
    int32_t multiplier;
    int32_t right_post_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int16_t kernel_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } fp32_sse2;
  struct {
    XNN_ALIGN(32) int16_t kernel_zero_point[16];
    XNN_ALIGN(32) float scale[8];
    XNN_ALIGN(32) float output_max_less_zero_point[8];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(32) uint8_t output_min[32];
  } fp32_avx2;
  struct {
    XNN_ALIGN(64) int16_t kernel_zero_point[32];
    XNN_ALIGN(64) float scale[16];
    XNN_ALIGN(64) float output_max_less_zero_point[16];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(64) uint8_t output_min[64];
  } fp32_avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // NOTE(review): output_max is int8_t here although this is the QU8 union;
  // kept exactly as found.
  struct {
    XNN_ALIGN(8) int16_t kernel_zero_point[4];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Add w. Min+Max: used by quantized VADD[C] microkernels with MINMAX activation.

// QS8 addition parameters: an always-available scalar variant of seven
// 32-bit fields, plus NEON, x86 (sse2, sse4_mul16, sse4_mul32, avx2, avx512)
// and WAsm SIMD variants gated on the target architecture.
union xnn_qs8_add_minmax_params {
  struct {
    int32_t bias;
    int32_t a_multiplier;
    int32_t b_multiplier;
    uint32_t shift;
    int32_t output_min_less_zero_point;
    int32_t output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    int8_t a_zero_point;
    int8_t b_zero_point;
    int16_t output_zero_point;
    int32_t a_multiplier;
    int32_t b_multiplier;
    int32_t right_shift;
    int8_t output_min;
    int8_t output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
    uint32_t shift;
    uint32_t b_multiplier;
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
    XNN_ALIGN(16) int16_t output_max[8];
  } sse2;
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
    uint32_t shift;
    uint32_t b_multiplier;
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } sse4_mul16;
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) int32_t a_multiplier[4];
    XNN_ALIGN(16) int32_t b_multiplier[4];
    XNN_ALIGN(16) uint64_t shift[2];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } sse4_mul32;
  struct {
    XNN_ALIGN(32) int32_t bias[8];
    XNN_ALIGN(32) int32_t a_multiplier[8];
    XNN_ALIGN(32) int32_t b_multiplier[8];
    XNN_ALIGN(32) uint64_t shift[4];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } avx2;
  struct {
    XNN_ALIGN(64) int32_t bias[16];
    XNN_ALIGN(64) int32_t a_multiplier[16];
    XNN_ALIGN(64) int32_t b_multiplier[16];
    XNN_ALIGN(64) uint64_t shift[8];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(32) int8_t output_min[32];
    XNN_ALIGN(32) int8_t output_max[32];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int32_t bias[2];
    XNN_ALIGN(8) int32_t a_multiplier[2];
    XNN_ALIGN(8) int32_t b_multiplier[2];
    uint32_t shift;
    XNN_ALIGN(8) int16_t output_zero_point[4];
    XNN_ALIGN(8) int8_t output_min[8];
    XNN_ALIGN(8) int8_t output_max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// QU8 addition parameters. The scalar variant has one more field (`rounding`)
// than its QS8 counterpart; x86 variants are sse2, sse4, avx2 and avx512.
union xnn_qu8_add_minmax_params {
  struct {
    int32_t bias;
    int32_t a_multiplier;
    int32_t b_multiplier;
    int32_t rounding;
    uint32_t shift;
    int32_t output_min_less_zero_point;
    int32_t output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint8_t a_zero_point;
    uint8_t b_zero_point;
    int16_t output_zero_point;
    int32_t a_multiplier;
    int32_t b_multiplier;
    int32_t right_shift;
    uint8_t output_min;
    uint8_t output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint16_t a_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t a_multiplier_hi[8];
    XNN_ALIGN(16) uint16_t b_multiplier_lo[8];
    XNN_ALIGN(16) uint16_t b_multiplier_hi[8];
    uint32_t shift;
    uint32_t b_multiplier;
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } sse2;
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) int32_t a_multiplier[4];
    XNN_ALIGN(16) int32_t b_multiplier[4];
    XNN_ALIGN(16) uint64_t shift[2];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } sse4;
  struct {
    XNN_ALIGN(32) int32_t bias[8];
    XNN_ALIGN(32) int32_t a_multiplier[8];
    XNN_ALIGN(32) int32_t b_multiplier[8];
    XNN_ALIGN(32) uint64_t shift[4];
    XNN_ALIGN(32) int16_t output_zero_point[16];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } avx2;
  struct {
    XNN_ALIGN(64) int32_t bias[16];
    XNN_ALIGN(64) int32_t a_multiplier[16];
    XNN_ALIGN(64) int32_t b_multiplier[16];
    XNN_ALIGN(64) uint64_t shift[8];
    XNN_ALIGN(64) int16_t output_zero_point[32];
    XNN_ALIGN(32) uint8_t output_min[32];
    XNN_ALIGN(32) uint8_t output_max[32];
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int32_t bias[2];
    XNN_ALIGN(8) int32_t a_multiplier[2];
    XNN_ALIGN(8) int32_t b_multiplier[2];
    uint32_t shift;
    XNN_ALIGN(8) int16_t output_zero_point[4];
    XNN_ALIGN(8) uint8_t output_min[8];
    XNN_ALIGN(8) uint8_t output_max[8];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Mul w. Min+Max: used by quantized VMUL[C] microkernels with MINMAX activation.

// QS8 multiplication parameters: one always-available scalar variant plus
// NEON (fp32_neon, fp32_neonv8, rndnu_neon), x86 (fp32_sse2, fp32_sse4) and
// WAsm SIMD variants gated on the target architecture.
union xnn_qs8_mul_minmax_params {
  struct {
    int32_t a_zero_point;
    int32_t b_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    int8_t a_zero_point[2];
    int8_t b_zero_point[2];
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  struct {
    int8_t a_zero_point[2];
    int8_t b_zero_point[2];
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neonv8;
  struct {
    int8_t a_zero_point[2];
    int8_t b_zero_point[2];
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int16_t a_zero_point[8];
    XNN_ALIGN(16) int16_t b_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
    XNN_ALIGN(16) int16_t output_max[8];
  } fp32_sse2;
  struct {
    XNN_ALIGN(16) int16_t a_zero_point[8];
    XNN_ALIGN(16) int16_t b_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
    XNN_ALIGN(16) int8_t output_max[16];
  } fp32_sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int16_t a_zero_point[4];
    XNN_ALIGN(8) int16_t b_zero_point[4];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// QU8 multiplication parameters: same scalar layout as the QS8 union; the
// architecture-specific variants use unsigned zero points / output bounds,
// and the only x86 variant is fp32_sse2.
union xnn_qu8_mul_minmax_params {
  struct {
    int32_t a_zero_point;
    int32_t b_zero_point;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint8_t a_zero_point[2];
    uint8_t b_zero_point[2];
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neon;
  struct {
    uint8_t a_zero_point[2];
    uint8_t b_zero_point[2];
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
  struct {
    uint8_t a_zero_point[2];
    uint8_t b_zero_point[2];
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int16_t a_zero_point[8];
    XNN_ALIGN(16) int16_t b_zero_point[8];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } fp32_sse2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int16_t a_zero_point[4];
    XNN_ALIGN(8) int16_t b_zero_point[4];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) uint8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// AvgPool w. Min+Max: used by quantized GAVGPOOL microkernels with MINMAX activation.

// QS8 average-pooling parameters: three always-available scalar variants
// (fmagic / imagic / lrintf), each starting with `init_bias`, plus NEON, x86
// (fp32_sse2, fp32_sse4) and WAsm SIMD variants gated on the target
// architecture.
union xnn_qs8_avgpool_minmax_params {
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neon;
  struct {
    int32_t init_bias;
    float scale;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } fp32_neonv8;
  struct {
    int32_t init_bias;
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    int8_t output_min;
    int8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int16_t output_min[8];
  } fp32_sse2;
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) int8_t output_min[16];
  } fp32_sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int32_t init_bias[2];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) int8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// QU8 average-pooling parameters. The first group of variants parallels the
// QS8 union with unsigned output bounds; the second group (scalar / neon /
// sse2) holds the legacy fixed-point layouts.
union xnn_qu8_avgpool_minmax_params {
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
  } fp32_scalar_fmagic;
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_min;
    int32_t magic_max;
    int32_t magic_bias_less_zero_point;
  } fp32_scalar_imagic;
  struct {
    int32_t init_bias;
    float scale;
    float output_min_less_zero_point;
    float output_max_less_zero_point;
    int32_t output_zero_point;
  } fp32_scalar_lrintf;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    int32_t init_bias;
    float scale;
    float magic_bias;
    int32_t magic_bias_less_output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neon;
  struct {
    int32_t init_bias;
    float scale;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } fp32_neonv8;
  struct {
    int32_t init_bias;
    int32_t left_pre_shift;
    int32_t multiplier;
    int32_t left_post_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } rndnu_neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } fp32_sse2;
  struct {
    XNN_ALIGN(16) int32_t init_bias[4];
    XNN_ALIGN(16) float scale[4];
    XNN_ALIGN(16) float output_max_less_zero_point[4];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
  } fp32_sse4;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int32_t init_bias[2];
    XNN_ALIGN(8) float scale[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) int32_t magic_min[2];
    XNN_ALIGN(8) int32_t magic_bias_less_output_zero_point[2];
    XNN_ALIGN(8) uint8_t output_max[8];
  } fp32_wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD

  // Legacy parameters used by QU8 AVGPOOL microkernels
  struct {
    int32_t bias;
    int32_t multiplier;
    int64_t rounding;
    uint32_t right_shift;
    int32_t output_min_less_zero_point;
    int32_t output_max_less_zero_point;
    int32_t output_zero_point;
  } scalar;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    int32_t bias;
    int32_t multiplier;
    int64_t left_shift;
    int16_t output_zero_point;
    uint8_t output_min;
    uint8_t output_max;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int32_t bias[4];
    XNN_ALIGN(16) uint32_t multiplier[4];
    XNN_ALIGN(16) uint64_t rounding[2];
    XNN_ALIGN(16) uint64_t right_shift[2];
    XNN_ALIGN(16) int16_t output_zero_point[8];
    XNN_ALIGN(16) uint8_t output_min[16];
    XNN_ALIGN(16) uint8_t output_max[16];
  } sse2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// Abs: used by VABS microkernels.
932 933 union xnn_f16_abs_params { 934 char _; // Dummy member variable to comply with the C standard 935 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 936 struct { 937 XNN_ALIGN(16) uint16_t nonsign_mask[8]; 938 } sse; 939 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 940 }; 941 942 union xnn_f32_abs_params { 943 char _; // Dummy member variable to comply with the C standard 944 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 945 struct { 946 XNN_ALIGN(16) float nonsign_mask[4]; 947 } sse; 948 struct { 949 XNN_ALIGN(32) float nonsign_mask[8]; 950 int32_t mask_table[14]; 951 } avx; 952 struct { 953 uint32_t nonsign_mask; 954 } avx512; 955 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 956 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 957 struct { 958 XNN_ALIGN(8) float nonsign_mask[2]; 959 } wasmsimd; 960 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 961 }; 962 963 964 // Cvt (Convert): used by VCVT microkernels. 965 966 union xnn_f16_f32_cvt_params { 967 struct { 968 uint32_t sign_mask; 969 uint32_t exp_offset; 970 float exp_scale; 971 uint32_t magic_mask; 972 float magic_bias; 973 uint32_t denorm_cutoff; 974 } scalar; 975 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 976 struct { 977 float exp_scale; 978 } neon; 979 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 980 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 981 struct { 982 XNN_ALIGN(16) uint16_t sign_mask[8]; 983 XNN_ALIGN(16) uint16_t exp_offset[8]; 984 XNN_ALIGN(16) float exp_scale[4]; 985 XNN_ALIGN(16) uint16_t magic_mask[8]; 986 XNN_ALIGN(16) float magic_bias[4]; 987 XNN_ALIGN(16) int16_t denorm_cutoff[8]; 988 } sse_int16; 989 struct { 990 XNN_ALIGN(16) uint32_t sign_mask[4]; 991 XNN_ALIGN(16) uint32_t exp_offset[4]; 992 XNN_ALIGN(16) float exp_scale[4]; 993 XNN_ALIGN(16) uint32_t magic_bias[4]; 994 XNN_ALIGN(16) int32_t denorm_cutoff[4]; 995 } sse_int32; 996 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 997 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 998 struct { 999 XNN_ALIGN(8) uint16_t sign_mask[4]; 1000 XNN_ALIGN(8) uint16_t exp_offset[4]; 
1001 XNN_ALIGN(8) float exp_scale[2]; 1002 XNN_ALIGN(8) uint16_t magic_mask[4]; 1003 XNN_ALIGN(8) float magic_bias[2]; 1004 XNN_ALIGN(8) int16_t denorm_cutoff[4]; 1005 } wasmsimd_int16; 1006 struct { 1007 XNN_ALIGN(8) uint32_t sign_mask[2]; 1008 XNN_ALIGN(8) uint32_t exp_offset[2]; 1009 XNN_ALIGN(8) float exp_scale[2]; 1010 XNN_ALIGN(8) uint32_t magic_bias[2]; 1011 XNN_ALIGN(8) int32_t denorm_cutoff[2]; 1012 } wasmsimd_int32; 1013 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1014 }; 1015 1016 union xnn_f32_f16_cvt_params { 1017 struct { 1018 uint32_t nonsign_mask; 1019 uint32_t exp_bias; 1020 float scale_to_inf; 1021 uint32_t expw_max; 1022 float scale_to_zero; 1023 uint32_t bias_min; 1024 uint16_t exph_mask; 1025 uint16_t manth_mask; 1026 uint16_t nanh; 1027 } scalar_bitcast; 1028 struct { 1029 float scale_to_inf; 1030 uint32_t exp_bias; 1031 float scale_to_zero; 1032 uint32_t expw_max; 1033 uint32_t bias_min; 1034 uint16_t exph_mask; 1035 uint16_t manth_mask; 1036 uint16_t nanh; 1037 } scalar_fabsf; 1038 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1039 struct { 1040 uint32_t exp_bias; 1041 float scale_to_inf; 1042 uint32_t expw_max; 1043 float scale_to_zero; 1044 } neon; 1045 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1046 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1047 struct { 1048 XNN_ALIGN(16) uint32_t nonsign_mask[4]; 1049 XNN_ALIGN(16) uint32_t exp_bias[4]; 1050 XNN_ALIGN(16) float scale_to_inf[4]; 1051 XNN_ALIGN(16) uint32_t expw_max[4]; 1052 XNN_ALIGN(16) float scale_to_zero[4]; 1053 XNN_ALIGN(16) int16_t bias_min[8]; 1054 XNN_ALIGN(16) uint32_t manth_mask[4]; 1055 XNN_ALIGN(16) uint32_t exph_mask[4]; 1056 XNN_ALIGN(16) uint16_t nanh[8]; 1057 } sse2; 1058 struct { 1059 int32_t mask_table[14]; 1060 } f16c; 1061 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1062 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1063 struct { 1064 XNN_ALIGN(8) uint32_t exp_bias[2]; 1065 XNN_ALIGN(8) float scale_to_inf[2]; 1066 XNN_ALIGN(8) uint32_t expw_max[2]; 1067 XNN_ALIGN(8) float 
scale_to_zero[2]; 1068 XNN_ALIGN(8) int16_t bias_min[4]; 1069 XNN_ALIGN(8) uint32_t manth_mask[2]; 1070 XNN_ALIGN(8) uint32_t exph_mask[2]; 1071 XNN_ALIGN(8) uint16_t nanh[4]; 1072 } wasmsimd; 1073 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1074 }; 1075 1076 union xnn_f32_qs8_cvt_params { 1077 struct { 1078 float scale; 1079 float output_min_less_zero_point; 1080 float output_max_less_zero_point; 1081 float magic_bias; 1082 int32_t magic_bias_less_zero_point; 1083 } scalar_fmagic; 1084 struct { 1085 float scale; 1086 float magic_bias; 1087 int32_t magic_min; 1088 int32_t magic_max; 1089 int32_t magic_bias_less_zero_point; 1090 } scalar_imagic; 1091 struct { 1092 float scale; 1093 float output_min_less_zero_point; 1094 float output_max_less_zero_point; 1095 int32_t output_zero_point; 1096 } scalar_lrintf; 1097 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1098 struct { 1099 float scale; 1100 float magic_bias; 1101 int32_t magic_bias_less_zero_point; 1102 int8_t output_min; 1103 int8_t output_max; 1104 } neon; 1105 struct { 1106 float scale; 1107 int16_t output_zero_point; 1108 int8_t output_min; 1109 int8_t output_max; 1110 } neonv8; 1111 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1112 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1113 struct { 1114 XNN_ALIGN(16) float scale[4]; 1115 XNN_ALIGN(16) float output_max_less_zero_point[4]; 1116 XNN_ALIGN(16) int16_t output_zero_point[8]; 1117 XNN_ALIGN(16) int16_t output_min[8]; 1118 } sse2; 1119 struct { 1120 XNN_ALIGN(16) float scale[4]; 1121 XNN_ALIGN(16) float output_max_less_zero_point[4]; 1122 XNN_ALIGN(16) int16_t output_zero_point[8]; 1123 XNN_ALIGN(16) int8_t output_min[16]; 1124 } sse4; 1125 struct { 1126 XNN_ALIGN(32) float scale[8]; 1127 XNN_ALIGN(32) float output_max_less_zero_point[8]; 1128 XNN_ALIGN(16) int16_t output_zero_point[8]; 1129 XNN_ALIGN(16) int8_t output_min[16]; 1130 int32_t mask_table[14]; 1131 } avx; 1132 struct { 1133 XNN_ALIGN(32) float scale[8]; 1134 XNN_ALIGN(32) float output_max_less_zero_point[8]; 
1135 XNN_ALIGN(32) int16_t output_zero_point[16]; 1136 XNN_ALIGN(32) uint32_t shuffle_mask[8]; 1137 XNN_ALIGN(32) int8_t output_min[32]; 1138 int32_t mask_table[14]; 1139 } avx2; 1140 struct { 1141 XNN_ALIGN(64) float scale[16]; 1142 XNN_ALIGN(64) float output_max_less_zero_point[16]; 1143 XNN_ALIGN(64) int16_t output_zero_point[32]; 1144 XNN_ALIGN(64) int8_t output_min[64]; 1145 XNN_ALIGN(64) uint32_t shuffle512_mask[16]; 1146 XNN_ALIGN(32) uint32_t shuffle256_mask[8]; 1147 } avx512; 1148 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1149 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1150 struct { 1151 XNN_ALIGN(8) float scale[2]; 1152 XNN_ALIGN(8) int16_t output_zero_point[4]; 1153 XNN_ALIGN(8) int8_t output_min[8]; 1154 XNN_ALIGN(8) int8_t output_max[8]; 1155 } wasmsimd_cvt; 1156 struct { 1157 XNN_ALIGN(8) float scale[2]; 1158 XNN_ALIGN(8) float magic_bias[2]; 1159 XNN_ALIGN(8) int32_t magic_min[2]; 1160 XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2]; 1161 XNN_ALIGN(8) int8_t output_max[8]; 1162 } wasmsimd_magic; 1163 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1164 }; 1165 1166 union xnn_f32_qu8_cvt_params { 1167 struct { 1168 float scale; 1169 float output_min_less_zero_point; 1170 float output_max_less_zero_point; 1171 float magic_bias; 1172 int32_t magic_bias_less_zero_point; 1173 } scalar_fmagic; 1174 struct { 1175 float scale; 1176 float magic_bias; 1177 int32_t magic_min; 1178 int32_t magic_max; 1179 int32_t magic_bias_less_zero_point; 1180 } scalar_imagic; 1181 struct { 1182 float scale; 1183 float output_min_less_zero_point; 1184 float output_max_less_zero_point; 1185 int32_t output_zero_point; 1186 } scalar_lrintf; 1187 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1188 struct { 1189 float scale; 1190 float magic_bias; 1191 int32_t magic_bias_less_zero_point; 1192 uint8_t output_min; 1193 uint8_t output_max; 1194 } neon; 1195 struct { 1196 float scale; 1197 int16_t output_zero_point; 1198 uint8_t output_min; 1199 uint8_t output_max; 1200 } neonv8; 
1201 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1202 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1203 struct { 1204 XNN_ALIGN(16) float scale[4]; 1205 XNN_ALIGN(16) float output_max_less_zero_point[4]; 1206 XNN_ALIGN(16) int16_t output_zero_point[8]; 1207 XNN_ALIGN(16) uint8_t output_min[16]; 1208 } sse2; 1209 struct { 1210 XNN_ALIGN(32) float scale[8]; 1211 XNN_ALIGN(32) float output_max_less_zero_point[8]; 1212 XNN_ALIGN(16) int16_t output_zero_point[8]; 1213 XNN_ALIGN(16) uint8_t output_min[16]; 1214 int32_t mask_table[14]; 1215 } avx; 1216 struct { 1217 XNN_ALIGN(32) float scale[8]; 1218 XNN_ALIGN(32) float output_max_less_zero_point[8]; 1219 XNN_ALIGN(32) int16_t output_zero_point[16]; 1220 XNN_ALIGN(32) uint32_t shuffle_mask[8]; 1221 XNN_ALIGN(32) uint8_t output_min[32]; 1222 int32_t mask_table[14]; 1223 } avx2; 1224 struct { 1225 XNN_ALIGN(64) float scale[16]; 1226 XNN_ALIGN(64) float output_max_less_zero_point[16]; 1227 XNN_ALIGN(64) int16_t output_zero_point[32]; 1228 XNN_ALIGN(64) uint8_t output_min[64]; 1229 XNN_ALIGN(64) uint32_t shuffle512_mask[16]; 1230 XNN_ALIGN(32) uint32_t shuffle256_mask[8]; 1231 } avx512; 1232 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1233 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1234 struct { 1235 XNN_ALIGN(8) float scale[2]; 1236 XNN_ALIGN(8) int16_t output_zero_point[4]; 1237 XNN_ALIGN(8) uint8_t output_min[8]; 1238 XNN_ALIGN(8) uint8_t output_max[8]; 1239 } wasmsimd_cvt; 1240 struct { 1241 XNN_ALIGN(8) float scale[2]; 1242 XNN_ALIGN(8) float magic_bias[2]; 1243 XNN_ALIGN(8) int32_t magic_min[2]; 1244 XNN_ALIGN(8) int32_t magic_bias_less_zero_point[2]; 1245 XNN_ALIGN(8) uint8_t output_max[8]; 1246 } wasmsimd_magic; 1247 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1248 }; 1249 1250 union xnn_qs8_cvt_params { 1251 struct { 1252 int32_t bias; 1253 int32_t multiplier; 1254 } scalar; 1255 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1256 struct { 1257 uint32_t minus_input_zero_point; 1258 int32_t multiplier; 1259 int32_t bias; 
1260 } armsimd32; 1261 struct { 1262 int16_t input_zero_point; 1263 int16_t multiplier; 1264 int16_t output_zero_point; 1265 } neon; 1266 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1267 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1268 struct { 1269 XNN_ALIGN(16) int16_t multiplier[8]; 1270 XNN_ALIGN(16) int32_t bias[4]; 1271 } sse2; 1272 struct { 1273 XNN_ALIGN(16) int16_t input_zero_point[8]; 1274 XNN_ALIGN(16) int16_t multiplier[8]; 1275 XNN_ALIGN(16) int16_t output_zero_point[8]; 1276 } ssse3; 1277 struct { 1278 XNN_ALIGN(32) int16_t input_zero_point[16]; 1279 XNN_ALIGN(32) int16_t multiplier[16]; 1280 XNN_ALIGN(32) int16_t output_zero_point[16]; 1281 } avx2; 1282 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1283 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1284 struct { 1285 XNN_ALIGN(8) int16_t input_zero_point[4]; 1286 XNN_ALIGN(8) int16_t multiplier[4]; 1287 XNN_ALIGN(8) int16_t output_zero_point[4]; 1288 } wasmsimd; 1289 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1290 }; 1291 1292 union xnn_qs8_f32_cvt_params { 1293 struct { 1294 int32_t zero_point; 1295 float scale; 1296 } scalar; 1297 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1298 struct { 1299 int16_t minus_zero_point[2]; 1300 float scale; 1301 } neon; 1302 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1303 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1304 struct { 1305 XNN_ALIGN(16) uint8_t sign_mask[16]; 1306 XNN_ALIGN(16) uint16_t magic_exp[8]; 1307 XNN_ALIGN(16) float magic_bias[4]; 1308 XNN_ALIGN(16) float scale[4]; 1309 } sse2; 1310 struct { 1311 XNN_ALIGN(16) int32_t minus_zero_point[4]; 1312 XNN_ALIGN(16) float scale[4]; 1313 } sse4; 1314 struct { 1315 XNN_ALIGN(32) int32_t minus_zero_point[8]; 1316 XNN_ALIGN(32) float scale[8]; 1317 } avx; 1318 struct { 1319 XNN_ALIGN(64) int32_t minus_zero_point[16]; 1320 XNN_ALIGN(64) float scale[16]; 1321 } avx512; 1322 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1323 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1324 struct { 1325 XNN_ALIGN(8) int16_t minus_zero_point[4]; 
1326 XNN_ALIGN(8) float scale[2]; 1327 } wasmsimd; 1328 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1329 }; 1330 1331 union xnn_qu8_cvt_params { 1332 struct { 1333 int32_t bias; 1334 int32_t multiplier; 1335 } scalar; 1336 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1337 struct { 1338 uint32_t minus_input_zero_point; 1339 int32_t multiplier; 1340 int32_t bias; 1341 } armsimd32; 1342 struct { 1343 uint16_t input_zero_point; 1344 int16_t multiplier; 1345 int16_t output_zero_point; 1346 } neon; 1347 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1348 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1349 struct { 1350 XNN_ALIGN(16) uint16_t multiplier[8]; 1351 XNN_ALIGN(16) int32_t bias[4]; 1352 } sse2; 1353 struct { 1354 XNN_ALIGN(16) uint16_t input_zero_point[8]; 1355 XNN_ALIGN(16) int16_t multiplier[8]; 1356 XNN_ALIGN(16) int16_t output_zero_point[8]; 1357 } ssse3; 1358 struct { 1359 XNN_ALIGN(32) uint16_t input_zero_point[16]; 1360 XNN_ALIGN(32) int16_t multiplier[16]; 1361 XNN_ALIGN(32) int16_t output_zero_point[16]; 1362 } avx2; 1363 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1364 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1365 struct { 1366 XNN_ALIGN(8) uint16_t input_zero_point[4]; 1367 XNN_ALIGN(8) int16_t multiplier[4]; 1368 XNN_ALIGN(8) int16_t output_zero_point[4]; 1369 } wasmsimd; 1370 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1371 }; 1372 1373 union xnn_qu8_f32_cvt_params { 1374 struct { 1375 int32_t zero_point; 1376 float scale; 1377 } scalar; 1378 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1379 struct { 1380 int16_t minus_zero_point[2]; 1381 float scale; 1382 } neon; 1383 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1384 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1385 struct { 1386 XNN_ALIGN(16) uint16_t magic_exp[8]; 1387 XNN_ALIGN(16) float magic_bias[4]; 1388 XNN_ALIGN(16) float scale[4]; 1389 } sse2; 1390 struct { 1391 XNN_ALIGN(16) int32_t minus_zero_point[4]; 1392 XNN_ALIGN(16) float scale[4]; 1393 } sse4; 1394 struct { 1395 XNN_ALIGN(32) int32_t 
minus_zero_point[8]; 1396 XNN_ALIGN(32) float scale[8]; 1397 } avx; 1398 struct { 1399 XNN_ALIGN(64) int32_t minus_zero_point[16]; 1400 XNN_ALIGN(64) float scale[16]; 1401 } avx512; 1402 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1403 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1404 struct { 1405 XNN_ALIGN(8) int16_t minus_zero_point[4]; 1406 XNN_ALIGN(8) float scale[2]; 1407 } wasmsimd; 1408 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1409 }; 1410 1411 1412 // ELU: used by VELU microkernels. 1413 1414 union xnn_f16_elu_params { 1415 char _; // Dummy member variable to comply with the C standard 1416 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1417 struct { 1418 uint16_t prescale; 1419 uint16_t sat_cutoff; 1420 uint16_t magic_bias; 1421 uint16_t log2e; 1422 uint16_t minus_ln2; 1423 uint16_t c3; 1424 uint16_t c2; 1425 uint16_t minus_alpha; 1426 uint16_t beta; 1427 } neonfp16arith_rr1_p3; 1428 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1429 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1430 struct { 1431 XNN_ALIGN(32) float prescale[8]; 1432 XNN_ALIGN(32) float sat_cutoff[8]; 1433 XNN_ALIGN(32) float magic_bias[8]; 1434 XNN_ALIGN(32) float log2e[8]; 1435 XNN_ALIGN(32) float minus_ln2[8]; 1436 XNN_ALIGN(32) float c3[8]; 1437 XNN_ALIGN(32) float c2[8]; 1438 XNN_ALIGN(32) float c1[8]; 1439 XNN_ALIGN(32) float alpha[8]; 1440 XNN_ALIGN(32) float beta[8]; 1441 } avx2_rr1_p3; 1442 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1443 }; 1444 1445 union xnn_f32_elu_params { 1446 struct { 1447 float prescale; 1448 float alpha; 1449 float beta; 1450 float sat_cutoff; 1451 float magic_bias; 1452 float log2e; 1453 float minus_ln2_hi; 1454 float minus_ln2_lo; 1455 float c3; 1456 float c2; 1457 float one; 1458 } scalar_rr2_lut16_p3; 1459 struct { 1460 float prescale; 1461 float alpha; 1462 float beta; 1463 float sat_cutoff; 1464 float magic_bias; 1465 float log2e; 1466 float minus_ln2_hi; 1467 float minus_ln2_lo; 1468 float c6; 1469 float c5; 1470 float c4; 1471 float c3; 1472 float c2; 1473 
float one; 1474 } scalar_rr2_p6; 1475 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 1476 struct { 1477 float prescale; 1478 float alpha; 1479 float beta; 1480 float sat_cutoff; 1481 float magic_bias; 1482 float log2e; 1483 float minus_ln2_hi; 1484 float minus_ln2_lo; 1485 float c6; 1486 float c5; 1487 float c4; 1488 float c3; 1489 float c2; 1490 } neon_rr2_p6; 1491 struct { 1492 float prescale; 1493 float alpha; 1494 float beta; 1495 float sat_cutoff; 1496 float magic_bias; 1497 float log2e; 1498 float minus_ln2_hi; 1499 float minus_ln2_lo; 1500 float c3; 1501 float c2; 1502 } neon_rr2_lut16_p3; 1503 struct { 1504 float prescale; 1505 float alpha; 1506 float beta; 1507 float sat_cutoff; 1508 float magic_bias; 1509 float log2e; 1510 float minus_ln2; 1511 float c6; 1512 float c5; 1513 float c4; 1514 float c3; 1515 float c2; 1516 } neonfma_rr1_p6; 1517 struct { 1518 float prescale; 1519 float alpha; 1520 float beta; 1521 float sat_cutoff; 1522 float magic_bias; 1523 float log2e; 1524 float minus_ln2; 1525 float c3; 1526 float c2; 1527 } neonfma_rr1_lut16_p3; 1528 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 1529 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 1530 struct { 1531 XNN_ALIGN(16) float prescale[4]; 1532 XNN_ALIGN(16) float alpha[4]; 1533 XNN_ALIGN(16) float beta[4]; 1534 XNN_ALIGN(16) float sat_cutoff[4]; 1535 XNN_ALIGN(16) float magic_bias[4]; 1536 XNN_ALIGN(16) float log2e[4]; 1537 XNN_ALIGN(16) uint32_t index_mask[4]; 1538 XNN_ALIGN(16) float minus_ln2_hi[4]; 1539 XNN_ALIGN(16) float minus_ln2_lo[4]; 1540 XNN_ALIGN(16) float c3[4]; 1541 XNN_ALIGN(16) float c2[4]; 1542 XNN_ALIGN(16) float one[4]; 1543 } sse2_rr2_lut16_p3; 1544 struct { 1545 XNN_ALIGN(16) float prescale[4]; 1546 XNN_ALIGN(16) float alpha[4]; 1547 XNN_ALIGN(16) float beta[4]; 1548 XNN_ALIGN(16) float sat_cutoff[4]; 1549 XNN_ALIGN(16) float magic_bias[4]; 1550 XNN_ALIGN(16) float log2e[4]; 1551 XNN_ALIGN(16) float minus_ln2_hi[4]; 1552 XNN_ALIGN(16) float minus_ln2_lo[4]; 1553 XNN_ALIGN(16) float c6[4]; 1554 
XNN_ALIGN(16) float c5[4]; 1555 XNN_ALIGN(16) float c4[4]; 1556 XNN_ALIGN(16) float c3[4]; 1557 XNN_ALIGN(16) float c2[4]; 1558 XNN_ALIGN(16) float one[4]; 1559 } sse2_rr2_p6; 1560 struct { 1561 XNN_ALIGN(32) float prescale[8]; 1562 XNN_ALIGN(32) float alpha[8]; 1563 XNN_ALIGN(32) float beta[8]; 1564 XNN_ALIGN(32) float sat_cutoff[8]; 1565 XNN_ALIGN(32) float magic_bias[8]; 1566 XNN_ALIGN(32) float log2e[8]; 1567 XNN_ALIGN(32) uint32_t index_mask[8]; 1568 XNN_ALIGN(32) float minus_ln2_hi[8]; 1569 XNN_ALIGN(32) float minus_ln2_lo[8]; 1570 XNN_ALIGN(32) float c3[8]; 1571 XNN_ALIGN(32) float c2[8]; 1572 XNN_ALIGN(32) float one[8]; 1573 int32_t mask_table[14]; 1574 } avx_rr2_lut16_p3; 1575 struct { 1576 XNN_ALIGN(32) float prescale[8]; 1577 XNN_ALIGN(32) float alpha[8]; 1578 XNN_ALIGN(32) float beta[8]; 1579 XNN_ALIGN(32) float sat_cutoff[8]; 1580 XNN_ALIGN(32) float magic_bias[8]; 1581 XNN_ALIGN(32) float log2e[8]; 1582 XNN_ALIGN(32) uint32_t index_mask[8]; 1583 XNN_ALIGN(32) float table[8]; 1584 XNN_ALIGN(32) float minus_ln2_hi[8]; 1585 XNN_ALIGN(32) float minus_ln2_lo[8]; 1586 XNN_ALIGN(32) float c4[8]; 1587 XNN_ALIGN(32) float c3[8]; 1588 XNN_ALIGN(32) float c2[8]; 1589 XNN_ALIGN(32) float one[8]; 1590 int32_t mask_table[14]; 1591 } avx_rr2_lut4_p4; 1592 struct { 1593 XNN_ALIGN(32) float prescale[8]; 1594 XNN_ALIGN(32) float alpha[8]; 1595 XNN_ALIGN(32) float beta[8]; 1596 XNN_ALIGN(32) float sat_cutoff[8]; 1597 XNN_ALIGN(32) float magic_bias[8]; 1598 XNN_ALIGN(32) float log2e[8]; 1599 XNN_ALIGN(32) float minus_ln2_hi[8]; 1600 XNN_ALIGN(32) float minus_ln2_lo[8]; 1601 XNN_ALIGN(32) float c6[8]; 1602 XNN_ALIGN(32) float c5[8]; 1603 XNN_ALIGN(32) float c4[8]; 1604 XNN_ALIGN(32) float c3[8]; 1605 XNN_ALIGN(32) float c2[8]; 1606 XNN_ALIGN(32) float one[8]; 1607 int32_t mask_table[14]; 1608 } avx_rr2_p6; 1609 struct { 1610 XNN_ALIGN(32) float prescale[8]; 1611 XNN_ALIGN(32) float alpha[8]; 1612 XNN_ALIGN(32) float beta[8]; 1613 XNN_ALIGN(32) float sat_cutoff[8]; 1614 
XNN_ALIGN(32) float magic_bias[8]; 1615 XNN_ALIGN(32) float log2e[8]; 1616 XNN_ALIGN(32) uint32_t index_mask[8]; 1617 XNN_ALIGN(32) float minus_ln2[8]; 1618 XNN_ALIGN(32) float c3[8]; 1619 XNN_ALIGN(32) float c2[8]; 1620 int32_t mask_table[14]; 1621 } avx2_rr1_lut16_p3; 1622 struct { 1623 XNN_ALIGN(32) float prescale[8]; 1624 XNN_ALIGN(32) float alpha[8]; 1625 XNN_ALIGN(32) float beta[8]; 1626 XNN_ALIGN(32) float sat_cutoff[8]; 1627 XNN_ALIGN(32) float magic_bias[8]; 1628 XNN_ALIGN(32) float log2e[8]; 1629 XNN_ALIGN(32) uint32_t table[8]; 1630 XNN_ALIGN(32) float minus_ln2[8]; 1631 XNN_ALIGN(32) float c4[8]; 1632 XNN_ALIGN(32) float c3[8]; 1633 XNN_ALIGN(32) float c2[8]; 1634 int32_t mask_table[14]; 1635 } avx2_rr1_lut8_p4; 1636 struct { 1637 XNN_ALIGN(32) float prescale[8]; 1638 XNN_ALIGN(32) float alpha[8]; 1639 XNN_ALIGN(32) float beta[8]; 1640 XNN_ALIGN(32) float sat_cutoff[8]; 1641 XNN_ALIGN(32) float magic_bias[8]; 1642 XNN_ALIGN(32) float log2e[8]; 1643 XNN_ALIGN(32) float table[8]; 1644 XNN_ALIGN(32) float minus_ln2[8]; 1645 XNN_ALIGN(32) float c4[8]; 1646 XNN_ALIGN(32) float c3[8]; 1647 XNN_ALIGN(32) float c2[8]; 1648 int32_t mask_table[14]; 1649 } avx2_rr1_lut4_p4; 1650 struct { 1651 XNN_ALIGN(32) float prescale[8]; 1652 XNN_ALIGN(32) float alpha[8]; 1653 XNN_ALIGN(32) float beta[8]; 1654 XNN_ALIGN(32) float sat_cutoff[8]; 1655 XNN_ALIGN(32) float magic_bias[8]; 1656 XNN_ALIGN(32) float log2e[8]; 1657 XNN_ALIGN(32) float minus_ln2[8]; 1658 XNN_ALIGN(32) float c6[8]; 1659 XNN_ALIGN(32) float c5[8]; 1660 XNN_ALIGN(32) float c4[8]; 1661 XNN_ALIGN(32) float c3[8]; 1662 XNN_ALIGN(32) float c2[8]; 1663 int32_t mask_table[14]; 1664 } avx2_rr1_p6; 1665 struct { 1666 float prescale; 1667 float alpha; 1668 float beta; 1669 float sat_cutoff; 1670 float magic_bias; 1671 float log2e; 1672 float minus_ln2; 1673 float c3; 1674 float c2; 1675 XNN_ALIGN(64) uint32_t table[16]; 1676 } avx512_rr1_lut16_p3; 1677 struct { 1678 float prescale; 1679 float alpha; 1680 float 
beta; 1681 float sat_cutoff; 1682 float magic_bias; 1683 float log2e; 1684 float minus_ln2; 1685 float c6; 1686 float c5; 1687 float c4; 1688 float c3; 1689 float c2; 1690 } avx512_rr1_p6; 1691 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 1692 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1693 struct { 1694 XNN_ALIGN(8) float prescale[2]; 1695 XNN_ALIGN(8) float alpha[2]; 1696 XNN_ALIGN(8) float beta[2]; 1697 XNN_ALIGN(8) float sat_cutoff[2]; 1698 XNN_ALIGN(8) float magic_bias[2]; 1699 XNN_ALIGN(8) float log2e[2]; 1700 XNN_ALIGN(8) uint32_t index_mask[2]; 1701 XNN_ALIGN(8) float minus_ln2_hi[2]; 1702 XNN_ALIGN(8) float minus_ln2_lo[2]; 1703 XNN_ALIGN(8) float c3[2]; 1704 XNN_ALIGN(8) float c2[2]; 1705 XNN_ALIGN(8) float one[2]; 1706 } wasmsimd_rr2_lut16_p3; 1707 struct { 1708 XNN_ALIGN(8) float prescale[2]; 1709 XNN_ALIGN(8) float alpha[2]; 1710 XNN_ALIGN(8) float beta[2]; 1711 XNN_ALIGN(8) float sat_cutoff[2]; 1712 XNN_ALIGN(8) float magic_bias[2]; 1713 XNN_ALIGN(8) float log2e[2]; 1714 XNN_ALIGN(8) float minus_ln2_hi[2]; 1715 XNN_ALIGN(8) float minus_ln2_lo[2]; 1716 XNN_ALIGN(8) float c6[2]; 1717 XNN_ALIGN(8) float c5[2]; 1718 XNN_ALIGN(8) float c4[2]; 1719 XNN_ALIGN(8) float c3[2]; 1720 XNN_ALIGN(8) float c2[2]; 1721 XNN_ALIGN(8) float one[2]; 1722 } wasmsimd_rr2_p6; 1723 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 1724 }; 1725 1726 1727 // ExpMinus: used by RADDEXPMINUSMAX microkernels. 
// Parameters for the f16 ExpMinus microkernels.
// Every struct below is an alternative layout of the same storage; a layout is
// only declared when its enclosing XNN_ARCH_* condition is true.
union xnn_f16_expminus_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint16_t magic_bias;
    uint16_t log2e;
    uint16_t minus_ln2_hi;
    uint16_t minus_ln2_lo;
    uint16_t c2;
    uint16_t c1;
    uint16_t denorm_cutoff;
  } neonfp16arith_rr2_p2;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
  } avx2_rr1_p2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Parameters for the f32 ExpMinus microkernels.
// The two scalar_* layouts are always declared; the remaining layouts are
// declared only for the architecture selected by their XNN_ARCH_* guard.
union xnn_f32_expminus_params {
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } scalar_rr2_p5;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c2;
    float denorm_cutoff;
  } scalar_rr2_lut64_p2;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } neon_rr2_p5;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2_hi;
    float minus_ln2_lo;
    float c2;
    float denorm_cutoff;
  } neon_rr2_lut64_p2;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float denorm_cutoff;
  } neonfma_rr1_p5;
  struct {
    float log2e;
    float magic_bias;
    float minus_ln2;
    float c2;
    float denorm_cutoff;
  } neonfma_rr1_lut64_p2;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float log2e[4];
    XNN_ALIGN(16) float magic_bias[4];
    XNN_ALIGN(16) float minus_ln2_hi[4];
    XNN_ALIGN(16) float minus_ln2_lo[4];
    XNN_ALIGN(16) float c5[4];
    XNN_ALIGN(16) float c4[4];
    XNN_ALIGN(16) float c3[4];
    XNN_ALIGN(16) float c2[4];
    XNN_ALIGN(16) float c1[4];
    XNN_ALIGN(16) float denorm_cutoff[4];
  } sse2_rr2_p5;
  struct {
    XNN_ALIGN(32) float log2e[8];
    XNN_ALIGN(32) float magic_bias[8];
    XNN_ALIGN(32) float minus_ln2[8];
    XNN_ALIGN(32) float c5[8];
    XNN_ALIGN(32) float c4[8];
    XNN_ALIGN(32) float c3[8];
    XNN_ALIGN(32) float c2[8];
    XNN_ALIGN(32) float c1[8];
    XNN_ALIGN(32) float denorm_cutoff[8];
    int32_t mask_table[14];
  } avx2_rr1_p5;
  struct {
    float log2e;
    float minus_ln2;
    float c5;
    float c4;
    float c3;
    float c2;
    float c1;
    float c0;
  } avx512_rr1_p5;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float log2e[2];
    XNN_ALIGN(8) float magic_bias[2];
    XNN_ALIGN(8) float minus_ln2_hi[2];
    XNN_ALIGN(8) float minus_ln2_lo[2];
    XNN_ALIGN(8) float c5[2];
    XNN_ALIGN(8) float c4[2];
    XNN_ALIGN(8) float c3[2];
    XNN_ALIGN(8) float c2[2];
    XNN_ALIGN(8) float c1[2];
    XNN_ALIGN(8) float denorm_cutoff[2];
  } wasmsimd_rr2_p5;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// HSwish: used by VHSWISH microkernels.

// Parameters for the f16 HSwish microkernels.
// Both layouts are architecture-specific, so the dummy member is what keeps
// the union non-empty (and therefore valid C) on every other target. It does
// not affect the layout or size of the arch-specific members, which still
// start at offset 0.
union xnn_f16_hswish_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint16_t sixth;
    uint16_t three;
    uint16_t six;
    uint16_t pad;  // pad to 8 bytes for neonfp16arith assembly.
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(32) float sixth[8];
    XNN_ALIGN(32) float three[8];
    XNN_ALIGN(16) uint16_t six[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Parameters for the f32 HSwish microkernels.
// The scalar layout is always declared. Note that the x86 layouts carry
// sixth/half/one, whereas the scalar and wasmsimd layouts carry
// sixth/three/six.
union xnn_f32_hswish_params {
  struct {
    float sixth;
    float three;
    float six;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float sixth[4];
    XNN_ALIGN(16) float half[4];
    XNN_ALIGN(16) float one[4];
  } sse;
  struct {
    XNN_ALIGN(32) float sixth[8];
    XNN_ALIGN(32) float half[8];
    XNN_ALIGN(32) float one[8];
    int32_t mask_table[14];
  } avx;
  struct {
    float sixth;
    float half;
    float one;
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float sixth[2];
    XNN_ALIGN(8) float three[2];
    XNN_ALIGN(8) float six[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// LReLU (Leaky ReLU): used by VLRELU microkernels.
// Parameters for the f16 LReLU microkernels.
// Both layouts are architecture-specific, so the dummy member is what keeps
// the union non-empty (and therefore valid C) on every other target. It does
// not affect the layout or size of the arch-specific members.
union xnn_f16_lrelu_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint16_t slope;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(32) float slope[8];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Parameters for the f32 LReLU microkernels.
// The scalar layout is always declared; the others depend on XNN_ARCH_*.
union xnn_f32_lrelu_params {
  struct {
    float slope;
  } scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float slope[4];
  } sse;
  struct {
    XNN_ALIGN(32) float slope[8];
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float slope[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameters for the qs8 LReLU microkernels.
// Two families of layouts are present: those carrying
// positive_multiplier/negative_multiplier and those carrying
// multiplier_diff/multiplier_base.
union xnn_qs8_lrelu_params {
  struct {
    int32_t input_zero_point;
    int32_t positive_multiplier;
    int32_t negative_multiplier;
    int32_t bias;
  } scalar_select;
  struct {
    int32_t input_zero_point;
    int32_t multiplier_diff;
    int32_t multiplier_base;
    int32_t bias;
  } scalar_andxor;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint32_t input_zero_point;
    uint32_t positive_multiplier;
    uint32_t negative_multiplier;
    int32_t bias;
  } armsimd32;
  struct {
    int16_t input_zero_point;
    int16_t positive_multiplier;
    int16_t negative_multiplier;
    int16_t output_zero_point;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t multiplier_diff[8];
    XNN_ALIGN(16) int16_t multiplier_base[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } sse2;
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t positive_multiplier[8];
    XNN_ALIGN(16) int16_t negative_multiplier[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } avx;
  struct {
    XNN_ALIGN(32) int16_t input_zero_point[16];
    XNN_ALIGN(32) int16_t positive_multiplier[16];
    XNN_ALIGN(32) int16_t negative_multiplier[16];
    XNN_ALIGN(32) int16_t output_zero_point[16];
  } avx2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t positive_multiplier[4];
    XNN_ALIGN(8) int16_t negative_multiplier[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_arm;
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t multiplier_diff[4];
    XNN_ALIGN(8) int16_t multiplier_base[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_x86;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};

// Parameters for the qu8 LReLU microkernels.
// Mirrors xnn_qs8_lrelu_params, except that neon.input_zero_point is unsigned
// here.
union xnn_qu8_lrelu_params {
  struct {
    int32_t input_zero_point;
    int32_t positive_multiplier;
    int32_t negative_multiplier;
    int32_t bias;
  } scalar_select;
  // NOTE(review): the member order here (multiplier_base, multiplier_diff) is
  // the reverse of xnn_qs8_lrelu_params.scalar_andxor (multiplier_diff,
  // multiplier_base). Harmless for by-name access; confirm it is intentional
  // before relying on the two layouts being interchangeable.
  struct {
    int32_t input_zero_point;
    int32_t multiplier_base;
    int32_t multiplier_diff;
    int32_t bias;
  } scalar_andxor;
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  struct {
    uint32_t input_zero_point;
    uint32_t positive_multiplier;
    uint32_t negative_multiplier;
    int32_t bias;
  } armsimd32;
  struct {
    uint16_t input_zero_point;
    int16_t positive_multiplier;
    int16_t negative_multiplier;
    int16_t output_zero_point;
  } neon;
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t multiplier_diff[8];
    XNN_ALIGN(16) int16_t multiplier_base[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } sse2;
  struct {
    XNN_ALIGN(16) int16_t input_zero_point[8];
    XNN_ALIGN(16) int16_t positive_multiplier[8];
    XNN_ALIGN(16) int16_t negative_multiplier[8];
    XNN_ALIGN(16) int16_t output_zero_point[8];
  } avx;
  struct {
    XNN_ALIGN(32) int16_t input_zero_point[16];
    XNN_ALIGN(32) int16_t positive_multiplier[16];
    XNN_ALIGN(32) int16_t negative_multiplier[16];
    XNN_ALIGN(32) int16_t output_zero_point[16];
  } avx2;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t positive_multiplier[4];
    XNN_ALIGN(8) int16_t negative_multiplier[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_arm;
  struct {
    XNN_ALIGN(8) int16_t input_zero_point[4];
    XNN_ALIGN(8) int16_t multiplier_diff[4];
    XNN_ALIGN(8) int16_t multiplier_base[4];
    XNN_ALIGN(8) int16_t output_zero_point[4];
  } wasmsimd_x86;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Neg: used by VNEG microkernels.
// Parameters for the f16 Neg microkernels.
// Only an x86 layout exists; the placeholder member keeps the union non-empty
// on every other target.
union xnn_f16_neg_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) uint16_t sign_mask[8];
  } sse;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};

// Parameters for the f32 Neg microkernels.
// Every layout carries a sign_mask; only its element type and count differ
// between layouts.
union xnn_f32_neg_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float sign_mask[4];
  } sse;
  struct {
    XNN_ALIGN(32) float sign_mask[8];
    int32_t mask_table[14];
  } avx;
  struct {
    uint32_t sign_mask;
  } avx512;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  struct {
    XNN_ALIGN(8) float sign_mask[2];
  } wasmsimd;
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
};


// Rnd (Round): used by VRNDNE/VRNDU/VRNDD/VRNDZ microkernels.

// Parameters for the f16 Rnd microkernels.
// No layout is declared; the union consists of the placeholder member alone.
union xnn_f16_rnd_params {
  char _;  // Dummy member variable to comply with the C standard
};

// Parameters for the f32 Rnd microkernels.
// Layouts exist for x86 only (sse2 and avx).
union xnn_f32_rnd_params {
  char _;  // Dummy member variable to comply with the C standard
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  struct {
    XNN_ALIGN(16) float sign_mask[4];
    XNN_ALIGN(16) float one[4];
  } sse2;
  struct {
    int32_t mask_table[14];
  } avx;
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
};


// Sigmoid: used by VSIGMOID microkernels.
2138 2139 union xnn_f16_sigmoid_params { 2140 char _; // Dummy member variable to comply with the C standard 2141 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 2142 struct { 2143 uint16_t magic_bias; 2144 uint16_t minus_log2e; 2145 uint16_t ln2_hi; 2146 uint16_t ln2_lo; 2147 uint16_t c2; 2148 uint16_t c1; 2149 uint16_t denorm_cutoff; 2150 } neonfp16arith_rr2_p2; 2151 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2152 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 2153 struct { 2154 XNN_ALIGN(32) float sign_mask[8]; 2155 XNN_ALIGN(32) float magic_bias[8]; 2156 XNN_ALIGN(32) float log2e[8]; 2157 XNN_ALIGN(32) float minus_ln2[8]; 2158 XNN_ALIGN(32) float c2[8]; 2159 XNN_ALIGN(32) float c1[8]; 2160 XNN_ALIGN(32) float one[8]; 2161 XNN_ALIGN(32) float denorm_cutoff[8]; 2162 } avx2_rr1_p2; 2163 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 2164 }; 2165 2166 union xnn_f32_sigmoid_params { 2167 struct { 2168 float magic_bias; 2169 float minus_log2e; 2170 float ln2_hi; 2171 float ln2_lo; 2172 float c1; 2173 float one; 2174 float denorm_cutoff; 2175 } scalar_rr2_lut2048_p1; 2176 struct { 2177 float magic_bias; 2178 float minus_log2e; 2179 float ln2_hi; 2180 float ln2_lo; 2181 float c2; 2182 float one; 2183 float denorm_cutoff; 2184 } scalar_rr2_lut64_p2; 2185 struct { 2186 float magic_bias; 2187 float minus_log2e; 2188 float ln2_hi; 2189 float ln2_lo; 2190 float c5; 2191 float c4; 2192 float c3; 2193 float c2; 2194 float c1; 2195 float one; 2196 float denorm_cutoff; 2197 } scalar_rr2_p5; 2198 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 2199 struct { 2200 float magic_bias; 2201 float minus_log2e; 2202 float ln2_hi; 2203 float ln2_lo; 2204 float c1; 2205 float denorm_cutoff; 2206 } neon_rr2_lut2048_p1; 2207 struct { 2208 float magic_bias; 2209 float minus_log2e; 2210 float ln2_hi; 2211 float ln2_lo; 2212 float c2; 2213 float denorm_cutoff; 2214 } neon_rr2_lut64_p2; 2215 struct { 2216 float magic_bias; 2217 float minus_log2e; 2218 float ln2_hi; 2219 float ln2_lo; 2220 float c5; 2221 float c4; 2222 float c3; 2223 
float c2; 2224 float c1; 2225 float denorm_cutoff; 2226 } neon_rr2_p5; 2227 struct { 2228 float magic_bias; 2229 float minus_log2e; 2230 float ln2; 2231 float c1; 2232 float denorm_cutoff; 2233 } neonfma_rr1_lut2048_p1; 2234 struct { 2235 float magic_bias; 2236 float minus_log2e; 2237 float ln2; 2238 float c2; 2239 float denorm_cutoff; 2240 } neonfma_rr1_lut64_p2; 2241 struct { 2242 float magic_bias; 2243 float minus_log2e; 2244 float ln2; 2245 float c5; 2246 float c4; 2247 float c3; 2248 float c2; 2249 float c1; 2250 float denorm_cutoff; 2251 } neonfma_rr1_p5; 2252 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2253 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 2254 struct { 2255 XNN_ALIGN(16) float sign_mask[4]; 2256 XNN_ALIGN(16) float magic_bias[4]; 2257 XNN_ALIGN(16) float log2e[4]; 2258 XNN_ALIGN(16) uint32_t index_mask[4]; 2259 XNN_ALIGN(16) float minus_ln2_hi[4]; 2260 XNN_ALIGN(16) float minus_ln2_lo[4]; 2261 XNN_ALIGN(16) float c2[4]; 2262 XNN_ALIGN(16) float one[4]; 2263 XNN_ALIGN(16) float denorm_cutoff[4]; 2264 } sse2_rr2_lut64_p2; 2265 struct { 2266 XNN_ALIGN(16) float sign_mask[4]; 2267 XNN_ALIGN(16) float magic_bias[4]; 2268 XNN_ALIGN(16) float log2e[4]; 2269 XNN_ALIGN(16) float minus_ln2_hi[4]; 2270 XNN_ALIGN(16) float minus_ln2_lo[4]; 2271 XNN_ALIGN(16) float c5[4]; 2272 XNN_ALIGN(16) float c4[4]; 2273 XNN_ALIGN(16) float c3[4]; 2274 XNN_ALIGN(16) float c2[4]; 2275 XNN_ALIGN(16) float c1[4]; 2276 XNN_ALIGN(16) float one[4]; 2277 XNN_ALIGN(16) float denorm_cutoff[4]; 2278 } sse2_rr2_p5; 2279 struct { 2280 XNN_ALIGN(32) float sign_mask[8]; 2281 XNN_ALIGN(32) float magic_bias[8]; 2282 XNN_ALIGN(32) float log2e[8]; 2283 XNN_ALIGN(32) float minus_ln2_hi[8]; 2284 XNN_ALIGN(32) float minus_ln2_lo[8]; 2285 XNN_ALIGN(32) float c5[8]; 2286 XNN_ALIGN(32) float c4[8]; 2287 XNN_ALIGN(32) float c3[8]; 2288 XNN_ALIGN(32) float c2[8]; 2289 XNN_ALIGN(32) float c1[8]; 2290 XNN_ALIGN(32) float one[8]; 2291 XNN_ALIGN(32) float two[8]; 2292 XNN_ALIGN(32) float denorm_cutoff[8]; 2293 
int32_t mask_table[14]; 2294 } avx_rr2_p5; 2295 struct { 2296 XNN_ALIGN(32) float sign_mask[8]; 2297 XNN_ALIGN(32) float magic_bias[8]; 2298 XNN_ALIGN(32) float log2e[8]; 2299 XNN_ALIGN(32) float minus_ln2[8]; 2300 XNN_ALIGN(32) float c5[8]; 2301 XNN_ALIGN(32) float c4[8]; 2302 XNN_ALIGN(32) float c3[8]; 2303 XNN_ALIGN(32) float c2[8]; 2304 XNN_ALIGN(32) float c1[8]; 2305 XNN_ALIGN(32) float one[8]; 2306 XNN_ALIGN(32) float denorm_cutoff[8]; 2307 int32_t mask_table[14]; 2308 } avx2_rr1_p5; 2309 struct { 2310 uint32_t sign_mask; 2311 float magic_bias; 2312 float log2e; 2313 float minus_ln2; 2314 float c3; 2315 float c2; 2316 float one; 2317 XNN_ALIGN(64) float table[16]; 2318 } avx512_rr1_lut16_p3; 2319 struct { 2320 uint32_t sign_mask; 2321 float magic_bias; 2322 float log2e; 2323 float minus_ln2_hi; 2324 float minus_ln2_lo; 2325 float c2; 2326 float c1; 2327 float one; 2328 XNN_ALIGN(64) float table_lo[16]; 2329 XNN_ALIGN(64) float table_hi[16]; 2330 } avx512_rr2_lut32_p2; 2331 struct { 2332 uint32_t sign_mask; 2333 float log2e; 2334 float minus_ln2; 2335 float c5; 2336 float c4; 2337 float c3; 2338 float c2; 2339 float c1; 2340 float one; 2341 } avx512_rr1_p5; 2342 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 2343 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 2344 struct { 2345 XNN_ALIGN(8) float magic_bias[2]; 2346 XNN_ALIGN(8) float minus_log2e[2]; 2347 XNN_ALIGN(8) uint32_t index_mask[2]; 2348 XNN_ALIGN(8) float ln2_hi[2]; 2349 XNN_ALIGN(8) float ln2_lo[2]; 2350 XNN_ALIGN(8) float c2[2]; 2351 XNN_ALIGN(8) float one[2]; 2352 XNN_ALIGN(8) float denorm_cutoff[2]; 2353 } wasmsimd_rr2_lut64_p2; 2354 struct { 2355 XNN_ALIGN(8) float magic_bias[2]; 2356 XNN_ALIGN(8) float minus_log2e[2]; 2357 XNN_ALIGN(8) float ln2_hi[2]; 2358 XNN_ALIGN(8) float ln2_lo[2]; 2359 XNN_ALIGN(8) float c5[2]; 2360 XNN_ALIGN(8) float c4[2]; 2361 XNN_ALIGN(8) float c3[2]; 2362 XNN_ALIGN(8) float c2[2]; 2363 XNN_ALIGN(8) float c1[2]; 2364 XNN_ALIGN(8) float one[2]; 2365 XNN_ALIGN(8) float 
denorm_cutoff[2]; 2366 } wasmsimd_rr2_p5; 2367 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD 2368 }; 2369 2370 2371 // Sqrt (Square Root): used by VSQRT microkernels. 2372 2373 union xnn_f16_sqrt_params { 2374 char _; // Dummy member variable to comply with the C standard 2375 }; 2376 2377 union xnn_f32_sqrt_params { 2378 char _; // Dummy member variable to comply with the C standard 2379 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 2380 struct { 2381 int32_t mask_table[14]; 2382 } avx; 2383 struct { 2384 XNN_ALIGN(32) float half[8]; 2385 int32_t mask_table[14]; 2386 } fma; 2387 struct { 2388 float half; 2389 } avx512; 2390 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 2391 }; 2392 2393 2394 // SqrtShift (Square Root + Shift): used by VSQRTSHIFT microkernels. 2395 2396 union xnn_u64_u32_sqrtshift_params { 2397 struct { 2398 uint32_t shift; 2399 } scalar; 2400 }; 2401 2402 // CHW: used by CONV/DWCONV microkernels in CHW layout with Min+Max parameters. 2403 2404 union xnn_f16_chw_params { 2405 char _; // Dummy member variable to comply with the C standard 2406 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 2407 struct { 2408 uint16_t min; 2409 uint16_t max; 2410 XNN_ALIGN(8) uint16_t mask_even[4]; // used by stride 2 kernels 2411 XNN_ALIGN(8) uint16_t mask_odd[4]; // used by stride 2 kernels 2412 XNN_ALIGN(8) uint16_t mask[4]; // used by stride 1 kernels 2413 XNN_ALIGN(16) uint16_t maskx8[8]; // used by stride 1 x8 kernels 2414 } neonfp16arith; 2415 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2416 }; 2417 2418 union xnn_f32_chw_params { 2419 struct { 2420 XNN_ALIGN(16) int32_t mask_even[4]; // used by stride 2 kernels 2421 XNN_ALIGN(16) int32_t mask_odd[4]; // used by stride 2 kernels 2422 XNN_ALIGN(16) int32_t mask[4]; // used by stride 1 kernels 2423 float min; 2424 float max; 2425 } scalar; 2426 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 2427 struct { 2428 float min; 2429 float max; 2430 XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels 2431 XNN_ALIGN(16) uint32_t 
mask_odd[4]; // used by stride 2 kernels 2432 XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels 2433 } neon; 2434 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 2435 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 2436 struct { 2437 XNN_ALIGN(16) float min[4]; 2438 XNN_ALIGN(16) float max[4]; 2439 XNN_ALIGN(16) uint32_t mask_even[4]; // used by stride 2 kernels 2440 XNN_ALIGN(16) uint32_t mask_odd[4]; // used by stride 2 kernels 2441 XNN_ALIGN(16) uint32_t mask[4]; // used by stride 1 kernels 2442 } sse; 2443 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 2444 }; 2445 2446 2447 // GAvgPool (Global Average Pool): used by GAVGPOOL microkernels in CHW layout with Scale+Min+Max parameters. 2448 2449 union xnn_f16_gavgpool_params { 2450 char _; // Dummy member variable to comply with the C standard 2451 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 2452 struct { 2453 XNN_ALIGN(16) uint16_t mask[8]; 2454 uint16_t multiplier; 2455 uint16_t output_min; 2456 uint16_t output_max; 2457 } neonfp16arith; 2458 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 */ 2459 }; 2460 2461 union xnn_f32_gavgpool_params { 2462 struct { 2463 XNN_ALIGN(16) int32_t mask[4]; 2464 float multiplier; 2465 float output_min; 2466 float output_max; 2467 } scalar; 2468 #if XNN_ARCH_X86 || XNN_ARCH_X86_64 2469 struct { 2470 XNN_ALIGN(16) float multiplier[4]; 2471 XNN_ALIGN(16) float output_min[4]; 2472 XNN_ALIGN(16) float output_max[4]; 2473 XNN_ALIGN(16) uint32_t mask[4]; 2474 } sse; 2475 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64 2476 #if XNN_ARCH_ARM || XNN_ARCH_ARM64 2477 struct { 2478 XNN_ALIGN(16) uint32_t mask[4]; 2479 float multiplier; 2480 float output_min; 2481 float output_max; 2482 } neon; 2483 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 */ 2484 }; 2485 2486 // Forward declare for use in microkernel headers for JIT generator functions. 2487 struct xnn_code_buffer; 2488 2489 typedef int xnn_status_t; 2490