/external/XNNPACK/src/qs8-gemm/gen/
D | 1x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal():
     49  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     61  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     65  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     69  va0x0 = vext_s8(va0x0, va0x0, 2);
     71  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     75  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     79  va0x0 = vext_s8(va0x0, va0x0, 2);
     81  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     85  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
     89  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

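The c2s4 kernels in this listing all share the inner step visible in the matches above: load eight consecutive int8 activations, widen-multiply them against a group of packed weights with vmull_s8, accumulate pairwise into int32 lanes, and rotate the activation vector two bytes with vext_s8 before the next column group. Below is a minimal stand-alone sketch of that step, not XNNPACK code: the helper name qs8_c2s4_step is made up, and it assumes the weights were pre-packed so their lane order matches the rotated activations, as XNNPACK's packing routines arrange.

    #include <arm_neon.h>

    /* One c2s4 step for a single row and four output columns: four passes of
     * widening multiply + pairwise accumulation, rotating the activations by
     * 2 bytes between passes.  `w` is assumed to hold 4 groups x 8 packed
     * int8 weights whose lane order matches the rotation (hypothetical). */
    static inline int32x4_t qs8_c2s4_step(const int8_t* a, const int8_t* w,
                                          int32x4_t vacc0123) {
      int8x8_t va0x0 = vld1_s8(a);                        /* k0..k7 of row 0 */
      for (int c = 0; c < 4; c++) {
        const int8x8_t vb0123 = vld1_s8(w + 8 * c);       /* weight group c */
        const int16x8_t vprod = vmull_s8(vb0123, va0x0);  /* int8 -> int16 products */
        vacc0123 = vpadalq_s16(vacc0123, vprod);          /* pairwise add into int32 */
        va0x0 = vext_s8(va0x0, va0x0, 2);                 /* the vext_s8 lines above */
      }
      return vacc0123;
    }

The generated kernels fully unroll these passes (the c0/c1/c2 suffixes in the matched lines) instead of looping.
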
D | 1x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qs8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal():
     50  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     62  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     66  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     70  va0x0 = vext_s8(va0x0, va0x0, 2);
     72  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     76  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     80  va0x0 = vext_s8(va0x0, va0x0, 2);
     82  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     86  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
     90  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x8c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal():
     49  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     61  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     65  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     69  va0x0 = vext_s8(va0x0, va0x0, 2);
     71  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     75  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     79  va0x0 = vext_s8(va0x0, va0x0, 2);
     81  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     85  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
     89  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x16c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal():
     51  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     71  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     75  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     79  int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
     83  int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
     87  va0x0 = vext_s8(va0x0, va0x0, 2);
     89  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     93  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     97  int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
    101  int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
    [all …]

D | 1x16c2s4-minmax-rndnu-neon-mull.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull():
     51  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     70  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     72  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     74  int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
     76  int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
     78  va0x0 = vext_s8(va0x0, va0x0, 2);
     79  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     81  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     83  int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
     85  int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
    [all …]

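The _mull and _mlal suffixes above differ in how far the K loop is unrolled: _mull kernels form each int16 product from a single activation/weight pair, while _mlal kernels keep a second pair in flight (the x1 counterparts of va0x0 and vb...x0, hence the x0 suffixes in the matched lines) and fold it into the same product with vmlal_s8 before accumulating. A sketch of the two product forms; the variable names follow the kernels, but the helper itself is hypothetical:

    #include <arm_neon.h>

    /* Contrast of the product forms.  In the generated _mlal kernels, va0x1
     * and vb0123c0x1 are the next block of K, loaded alongside the x0 pair. */
    static inline int32x4_t qs8_mlal_product(int8x8_t va0x0, int8x8_t va0x1,
                                             int8x8_t vb0123c0x0,
                                             int8x8_t vb0123c0x1,
                                             int32x4_t vacc0x0123) {
      int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);       /* _mull stops here */
      vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, va0x1);  /* extra _mlal fold */
      return vpadalq_s16(vacc0x0123, vprod0x0123c0);
    }
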
D | 1x16c4s2-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4s2__neon_mlal():
     55  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     75  int16x8_t vprod0x01c0 = vmull_s8(vb01c0x0, va0x0);
     79  int16x8_t vprod0x23c0 = vmull_s8(vb23c0x0, va0x0);
     83  int16x8_t vprod0x45c0 = vmull_s8(vb45c0x0, va0x0);
     87  int16x8_t vprod0x67c0 = vmull_s8(vb67c0x0, va0x0);
     91  int16x8_t vprod0x89c0 = vmull_s8(vb89c0x0, va0x0);
     95  int16x8_t vprod0xABc0 = vmull_s8(vbABc0x0, va0x0);
     99  int16x8_t vprod0xCDc0 = vmull_s8(vbCDc0x0, va0x0);
    103  int16x8_t vprod0xEFc0 = vmull_s8(vbEFc0x0, va0x0);
    107  va0x0 = vext_s8(va0x0, va0x0, 4);
    [all …]

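The lone c4s2 entry above applies the same rotation trick with a different shape: each 8-byte weight vector covers two output columns x four bytes of K, so two passes suffice and the rotation is four bytes (line 107). A sketch under the same caveats as the c2s4 one, reusing its #include <arm_neon.h>; the helper name is made up, and it assumes a weight packing that pre-shuffles the K halves so the two passes are complementary:

    /* Hypothetical c4s2 step for one row and one pair of output columns. */
    static inline int32x4_t qs8_c4s2_step(const int8_t* a, const int8_t* w,
                                          int32x4_t vacc01) {
      int8x8_t va0x0 = vld1_s8(a);                   /* k0..k7 of row 0 */
      for (int c = 0; c < 2; c++) {
        const int8x8_t vb01 = vld1_s8(w + 8 * c);    /* cols 0-1, pass c */
        vacc01 = vpadalq_s16(vacc01, vmull_s8(vb01, va0x0));
        va0x0 = vext_s8(va0x0, va0x0, 4);            /* 4-byte rotate, cf. line 107 */
      }
      return vacc01;  /* two int32 partials per column; fold with vpadd_s32 later */
    }
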
D | 2x8c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal():
     57  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     71  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     78  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     85  va0x0 = vext_s8(va0x0, va0x0, 2);
     89  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     96  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    103  va0x0 = vext_s8(va0x0, va0x0, 2);
    107  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    114  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    121  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal():
     58  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     72  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     79  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     86  va0x0 = vext_s8(va0x0, va0x0, 2);
     90  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     97  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    104  va0x0 = vext_s8(va0x0, va0x0, 2);
    108  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    115  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    122  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qs8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal():
     57  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     71  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     78  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     85  va0x0 = vext_s8(va0x0, va0x0, 2);
     89  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     96  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    103  va0x0 = vext_s8(va0x0, va0x0, 2);
    107  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    114  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    121  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

/external/XNNPACK/src/qs8-igemm/gen/
D | 1x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal():
     61  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     73  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     77  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     81  va0x0 = vext_s8(va0x0, va0x0, 2);
     83  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     87  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     91  va0x0 = vext_s8(va0x0, va0x0, 2);
     93  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     97  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    101  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x8c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal():
     60  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     72  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     76  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     80  va0x0 = vext_s8(va0x0, va0x0, 2);
     82  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     86  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     90  va0x0 = vext_s8(va0x0, va0x0, 2);
     92  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     96  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    100  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qs8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal():
     60  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     72  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     76  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     80  va0x0 = vext_s8(va0x0, va0x0, 2);
     82  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     86  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     90  va0x0 = vext_s8(va0x0, va0x0, 2);
     92  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     96  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    100  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x16c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal():
     62  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     82  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     86  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     90  int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
     94  int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
     98  va0x0 = vext_s8(va0x0, va0x0, 2);
    100  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
    104  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    108  int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
    112  int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
    [all …]

D | 1x16c2s4-minmax-rndnu-neon-mull.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull():
     62  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     81  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     83  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     85  int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, va0x0);
     87  int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, va0x0);
     89  va0x0 = vext_s8(va0x0, va0x0, 2);
     90  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     92  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     94  int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, va0x0);
     96  int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, va0x0);
    [all …]

D | 2x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal():
     71  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     85  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     92  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     99  va0x0 = vext_s8(va0x0, va0x0, 2);
    103  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
    110  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    117  va0x0 = vext_s8(va0x0, va0x0, 2);
    121  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    128  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    135  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-rndnu-neon-mlal.c | in xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal():
     70  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     84  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     91  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     98  va0x0 = vext_s8(va0x0, va0x0, 2);
    102  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
    109  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    116  va0x0 = vext_s8(va0x0, va0x0, 2);
    120  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    127  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    134  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qs8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal():
     70  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     84  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     91  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     98  va0x0 = vext_s8(va0x0, va0x0, 2);
    102  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
    109  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    116  va0x0 = vext_s8(va0x0, va0x0, 2);
    120  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    127  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    134  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

/external/XNNPACK/src/qc8-igemm/gen/
D | 1x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal():
     61  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     73  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     77  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     81  va0x0 = vext_s8(va0x0, va0x0, 2);
     83  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     87  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     91  va0x0 = vext_s8(va0x0, va0x0, 2);
     93  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     97  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    101  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal():
     60  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     72  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     76  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     80  va0x0 = vext_s8(va0x0, va0x0, 2);
     82  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     86  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     90  va0x0 = vext_s8(va0x0, va0x0, 2);
     92  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     96  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    100  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal():
     70  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     84  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     91  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     98  va0x0 = vext_s8(va0x0, va0x0, 2);
    102  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
    109  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    116  va0x0 = vext_s8(va0x0, va0x0, 2);
    120  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    127  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    134  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal():
     71  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     85  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     92  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     99  va0x0 = vext_s8(va0x0, va0x0, 2);
    103  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
    110  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    117  va0x0 = vext_s8(va0x0, va0x0, 2);
    121  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    128  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    135  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

/external/XNNPACK/src/qc8-gemm/gen/
D | 1x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal():
     50  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     62  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     66  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     70  va0x0 = vext_s8(va0x0, va0x0, 2);
     72  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     76  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     80  va0x0 = vext_s8(va0x0, va0x0, 2);
     82  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     86  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
     90  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 1x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal():
     49  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     61  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     65  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     69  va0x0 = vext_s8(va0x0, va0x0, 2);
     71  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     75  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
     79  va0x0 = vext_s8(va0x0, va0x0, 2);
     81  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
     85  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
     89  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neon-mlal.c | in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal():
     57  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     71  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     78  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     85  va0x0 = vext_s8(va0x0, va0x0, 2);
     89  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     96  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    103  va0x0 = vext_s8(va0x0, va0x0, 2);
    107  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    114  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    121  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]

D | 2x8c2s4-minmax-fp32-neonv8-mlal.c | in xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal():
     58  int8x8_t va0x0 = vld1_s8(a0); a0 += 8;   (local)
     72  int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, va0x0);
     79  int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, va0x0);
     86  va0x0 = vext_s8(va0x0, va0x0, 2);
     90  int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, va0x0);
     97  int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, va0x0);
    104  va0x0 = vext_s8(va0x0, va0x0, 2);
    108  int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, va0x0);
    115  int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, va0x0);
    122  va0x0 = vext_s8(va0x0, va0x0, 2);
    [all …]