/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 123 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 228 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 229 int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 230 int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 231 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 232 vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 233 vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 146 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 284 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 285 int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 286 int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 287 int16x8_t vprod3x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 288 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 289 vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 290 vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 291 vprod3x11 = vmlal_s8(vprod3x11, vget_high_s8(vb11), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 100 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 172 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 173 int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 174 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 175 vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 77 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 116 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 117 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 100 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 101 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 145 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 146 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 147 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 235 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 236 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 237 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 238 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 239 const int16x8_t vprod3x11 = vmull_s8(vb11, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 190 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 191 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 192 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 193 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 189 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 190 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 284 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 285 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 286 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 379 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 380 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 381 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 382 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 474 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 475 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 476 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 477 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 478 const int16x8_t vprod3x11 = vmull_s8(vb11, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 138 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 243 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 244 int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 245 int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 246 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 247 vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 248 vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 163 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 301 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 302 int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 303 int16x8_t vprod2x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 304 int16x8_t vprod3x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 305 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 306 vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 307 vprod2x11 = vmlal_s8(vprod2x11, vget_high_s8(vb11), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 308 vprod3x11 = vmlal_s8(vprod3x11, vget_high_s8(vb11), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 113 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 185 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 186 int16x8_t vprod1x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 187 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 188 vprod1x11 = vmlal_s8(vprod1x11, vget_high_s8(vb11), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 88 const int8x16_t vb11 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 127 int16x8_t vprod0x11 = vmull_s8(vget_low_s8(vb11), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 128 vprod0x11 = vmlal_s8(vprod0x11, vget_high_s8(vb11), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 158 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 159 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 160 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 252 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 253 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 254 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 255 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 256 const int16x8_t vprod3x11 = vmull_s8(vb11, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 111 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 112 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 205 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 206 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 207 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 208 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 200 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 201 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 297 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 298 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 299 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 491 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 492 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 493 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 494 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 495 const int16x8_t vprod3x11 = vmull_s8(vb11, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 394 const int8x8_t vb11 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 395 const int16x8_t vprod0x11 = vmull_s8(vb11, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 396 const int16x8_t vprod1x11 = vmull_s8(vb11, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 397 const int16x8_t vprod2x11 = vmull_s8(vb11, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|