/external/XNNPACK/src/qs8-gemm/gen/
D | 3x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal():
    124  const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    237  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    238  int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
    239  int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
    240  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
    241  vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
    242  vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
|
D | 4x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal():
    147  const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    296  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    297  int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
    298  int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
    299  int16x8_t vprod3x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va3));
    300  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
    301  vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
    302  vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
    303  vprod3x12 = vmlal_s8(vprod3x12, vget_high_s8(vb12), vget_high_s8(va3));
|
D | 2x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal():
    101  const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    178  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    179  int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
    180  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
    181  vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
|
D | 1x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal():
    78   const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    119  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    120  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
|
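Every c16 entry above shows the same two-step inner pattern: vmull_s8 widens the low eight int8 lanes of the weight vector vb12 against the low lanes of an activation vector, then vmlal_s8 multiplies the high eight lanes and accumulates onto the same int16x8 partial product. The sketch below isolates that step for one 16-deep slice. It is a minimal illustration assuming an AArch64 target; dot16_s8 and the closing vaddlvq_s16 reduction are hypothetical stand-ins for the kernels' own accumulation, not code taken from XNNPACK.

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdio.h>

    /* c16 step: widening multiply of the low halves (vmull_s8), then a
     * widening multiply-accumulate of the high halves (vmlal_s8) onto
     * the same int16x8 partial product. */
    static int32_t dot16_s8(const int8_t a[16], const int8_t b[16]) {
      const int8x16_t va = vld1q_s8(a);
      const int8x16_t vb = vld1q_s8(b);
      int16x8_t vprod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));
      vprod = vmlal_s8(vprod, vget_high_s8(vb), vget_high_s8(va));
      return vaddlvq_s16(vprod);  /* widening horizontal sum, AArch64-only */
    }

    int main(void) {
      int8_t a[16], b[16];
      int32_t ref = 0;
      for (int i = 0; i < 16; i++) {
        a[i] = (int8_t) (i - 8);
        b[i] = (int8_t) (3 * i - 20);
        ref += (int32_t) a[i] * b[i];
      }
      printf("neon=%d ref=%d\n", (int) dot16_s8(a, b), (int) ref);
      return 0;
    }

The kernels run this step once per weight vector per output row (three rows in the 3x16 shape, hence vprod0x12 through vprod2x12) and fold the int16x8 partials into int32 accumulators afterwards; the horizontal sum above is just a scalar-checkable stand-in for that fold.
|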
D | 1x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull():
    103  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    104  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
|
D | 2x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull():
    150  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    151  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    152  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
|
D | 4x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull():
    244  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    245  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    246  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    247  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
    248  const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
|
D | 3x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull():
    197  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    198  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    199  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    200  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
|
D | 3x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal():
    386  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    387  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    388  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    389  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
|
D | 2x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal():
    289  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    290  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    291  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
|
D | 1x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal():
    192  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    193  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
|
D | 4x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal():
    483  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    484  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    485  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    486  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
    487  const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
|
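The c8 entries above use half-width loads instead: each 8-deep slice of packed weights arrives via vld1_s8, and a single vmull_s8 widens it against the per-row activation vector. The c8 *-mlal.c files unroll two such slices in their main loop and fuse the second multiply into vmlal_s8, so the vmull_s8-only lines excerpted from them here evidently come from the kernels' single-slice tail. Below is a minimal one-row sketch of the scheme, again assuming an AArch64 target; dot_c8_s8 and its vpadalq_s16/vaddvq_s32 reduction are illustrative, not XNNPACK code.

    #include <arm_neon.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /* c8 step: load 8 int8 weights, widen-multiply against 8 activations
     * (vmull_s8), and pairwise-accumulate the int16 products into an
     * int32x4 accumulator (vpadalq_s16). */
    static int32_t dot_c8_s8(const int8_t* a, const int8_t* w, size_t k) {
      int32x4_t vacc = vdupq_n_s32(0);
      for (size_t i = 0; i < k; i += 8) {
        const int8x8_t va = vld1_s8(a + i);
        const int8x8_t vb = vld1_s8(w + i);
        const int16x8_t vprod = vmull_s8(vb, va);
        vacc = vpadalq_s16(vacc, vprod);  /* int16 pairs -> int32 lanes */
      }
      return vaddvq_s32(vacc);  /* horizontal sum, AArch64-only */
    }

    int main(void) {
      int8_t a[16], w[16];
      int32_t ref = 0;
      for (int i = 0; i < 16; i++) {
        a[i] = (int8_t) (5 * i - 40);
        w[i] = (int8_t) (i - 7);
        ref += (int32_t) a[i] * w[i];
      }
      printf("neon=%d ref=%d\n", (int) dot_c8_s8(a, w, 16), (int) ref);
      return 0;
    }

vb12 is the thirteenth of the sixteen per-output-channel weight vectors in a 16-wide tile, which is why every file in this listing touches it. The igemm listing that follows repeats both the c16 and c8 patterns unchanged; only the function names and source line numbers differ.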
/external/XNNPACK/src/qs8-igemm/gen/
D | 3x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c16__neon_mlal():
    139  const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    252  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    253  int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
    254  int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
    255  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
    256  vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
    257  vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
|
D | 4x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c16__neon_mlal():
    164  const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    313  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    314  int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
    315  int16x8_t vprod2x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va2));
    316  int16x8_t vprod3x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va3));
    317  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
    318  vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
    319  vprod2x12 = vmlal_s8(vprod2x12, vget_high_s8(vb12), vget_high_s8(va2));
    320  vprod3x12 = vmlal_s8(vprod3x12, vget_high_s8(vb12), vget_high_s8(va3));
|
D | 2x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c16__neon_mlal():
    114  const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    191  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    192  int16x8_t vprod1x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va1));
    193  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
    194  vprod1x12 = vmlal_s8(vprod1x12, vget_high_s8(vb12), vget_high_s8(va1));
|
D | 1x16c16-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c16__neon_mlal():
    89   const int8x16_t vb12 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t));  (local)
    130  int16x8_t vprod0x12 = vmull_s8(vget_low_s8(vb12), vget_low_s8(va0));
    131  vprod0x12 = vmlal_s8(vprod0x12, vget_high_s8(vb12), vget_high_s8(va0));
|
D | 2x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mull():
    163  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    164  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    165  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
|
D | 4x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mull():
    261  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    262  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    263  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    264  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
    265  const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
|
D | 3x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mull():
    212  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    213  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    214  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    215  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
|
D | 1x16c8-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mull():
    114  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    115  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
|
D | 3x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_3x16c8__neon_mlal():
    401  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    402  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    403  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    404  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
|
D | 2x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x16c8__neon_mlal():
    302  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    303  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    304  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
|
D | 1x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c8__neon_mlal():
    203  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    204  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
|
D | 4x16c8-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c8__neon_mlal():
    500  const int8x8_t vb12 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  (local)
    501  const int16x8_t vprod0x12 = vmull_s8(vb12, va0);
    502  const int16x8_t vprod1x12 = vmull_s8(vb12, va1);
    503  const int16x8_t vprod2x12 = vmull_s8(vb12, va2);
    504  const int16x8_t vprod3x12 = vmull_s8(vb12, va3);
|