/external/XNNPACK/src/qs8-gemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 121 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 210 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 211 int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 212 int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 213 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 214 vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal() 215 vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 144 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 260 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 261 int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 262 int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 263 int16x8_t vprod3x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 264 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 265 vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 266 vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal() 267 vprod3x9 = vmlal_s8(vprod3x9, vget_high_s8(vb9), vget_high_s8(va3)); in xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 98 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 160 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 161 int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 162 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() 163 vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1)); in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 75 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 110 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal() 111 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_gemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 94 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal() local 95 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 135 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() local 136 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal() 137 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 217 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() local 218 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 219 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 220 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal() 221 const int16x8_t vprod3x9 = vmull_s8(vb9, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 176 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() local 177 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 178 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal() 179 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 183 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 184 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 274 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 275 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() 276 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 365 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 366 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 367 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal() 368 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 456 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 457 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 458 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 459 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal() 460 const int16x8_t vprod3x9 = vmull_s8(vb9, va3); in xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 3x16c16-minmax-neon-mlal-padal.c | 136 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() local 225 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 226 int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 227 int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 228 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 229 vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal() 230 vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
|
D | 4x16c16-minmax-neon-mlal-padal.c | 161 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() local 277 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 278 int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 279 int16x8_t vprod2x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 280 int16x8_t vprod3x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 281 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 282 vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 283 vprod2x9 = vmlal_s8(vprod2x9, vget_high_s8(vb9), vget_high_s8(va2)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal() 284 vprod3x9 = vmlal_s8(vprod3x9, vget_high_s8(vb9), vget_high_s8(va3)); in xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c | 111 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() local 173 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 174 int16x8_t vprod1x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 175 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal() 176 vprod1x9 = vmlal_s8(vprod1x9, vget_high_s8(vb9), vget_high_s8(va1)); in xnn_qs8_igemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 1x16c16-minmax-neon-mlal-padal.c | 86 const int8x16_t vb9 = vld1q_s8(w); w = (const void*) ((uintptr_t) w + 16 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() local 121 int16x8_t vprod0x9 = vmull_s8(vget_low_s8(vb9), vget_low_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal() 122 vprod0x9 = vmlal_s8(vprod0x9, vget_high_s8(vb9), vget_high_s8(va0)); in xnn_qs8_igemm_minmax_ukernel_1x16c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mull-padal.c | 148 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() local 149 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal() 150 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mull_padal()
|
D | 4x16c8-minmax-neon-mull-padal.c | 234 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() local 235 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 236 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 237 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal() 238 const int16x8_t vprod3x9 = vmull_s8(vb9, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mull-padal.c | 105 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal() local 106 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mull_padal()
|
D | 3x16c8-minmax-neon-mull-padal.c | 191 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() local 192 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 193 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal() 194 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal()
|
D | 1x16c8-minmax-neon-mlal-padal.c | 194 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal() local 195 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c | 287 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local 288 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() 289 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 4x16c8-minmax-neon-mlal-padal.c | 473 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() local 474 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 475 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 476 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal() 477 const int16x8_t vprod3x9 = vmull_s8(vb9, va3); in xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
|
D | 3x16c8-minmax-neon-mlal-padal.c | 380 const int8x8_t vb9 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t)); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() local 381 const int16x8_t vprod0x9 = vmull_s8(vb9, va0); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 382 const int16x8_t vprod1x9 = vmull_s8(vb9, va1); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal() 383 const int16x8_t vprod2x9 = vmull_s8(vb9, va2); in xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
|