/external/XNNPACK/src/qs8-gemm/gen/
D | 2x8c8-minmax-neon-mlal-padal.c |
  100  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  102  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  104  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  161  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  163  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
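All of the NEON MLAL entries in this listing define vprod1x2 the same way: vmull_s8 widens one 8-byte slice of the int8 inputs to int16, vmlal_s8 folds in the second slice, and vpadalq_s16 pairwise-adds the int16 products into the int32 accumulator (the remainder path, e.g. lines 161-163 above, has only one slice and skips the vmlal_s8 step). A minimal, self-contained sketch of that pattern, with illustrative names rather than the kernels' actual locals:

    #include <arm_neon.h>

    // Hypothetical helper mirroring the vmull/vmlal/vpadal sequence above.
    // Two 8-byte slices of A and B are multiplied at int16 precision, then
    // pairwise-added into an int32 accumulator. Names are illustrative.
    static inline int32x4_t acc_dot_c8(int32x4_t vacc,
                                       int8x8_t va_lo, int8x8_t va_hi,
                                       int8x8_t vb_lo, int8x8_t vb_hi)
    {
      int16x8_t vprod = vmull_s8(vb_lo, va_lo);  // 8 x int8*int8 -> int16
      vprod = vmlal_s8(vprod, vb_hi, va_hi);     // accumulate the second slice
      return vpadalq_s16(vacc, vprod);           // pairwise add into int32 lanes
    }

The int16 intermediate can only absorb a couple of int8-by-int8 products before risking overflow, which is presumably why each vpadalq_s16 in these kernels follows at most one vmull_s8/vmlal_s8 pair.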
D | 2x4c8-xw-minmax-wasmsimd.c |
  89  const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd() local
  90  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd()
  91  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd()
|
D | 2x4c8-minmax-wasmsimd-ld128.c |
  92  const v128_t vprod1x2 = wasm_i16x8_mul(vxb2, vxa1);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128() local
  93  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
  100  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
|
D | 2x4c8-minmax-wasmsimd-ld64.c |
  89  const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64() local
  90  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
  91  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
|
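The WAsm SIMD entries express the same widening multiply-accumulate with 128-bit vectors: wasm_i16x8_mul multiplies eight already sign-extended int16 lanes, and the low and high halves of the product are widened to int32x4 and added separately. A hedged sketch under the same naming assumptions (note that newer toolchains spell the widen intrinsics wasm_i32x4_extend_low_i16x8/wasm_i32x4_extend_high_i16x8):

    #include <wasm_simd128.h>

    // Illustrative sketch, not the kernel itself: vxa and vxb are assumed to
    // hold sign-extended int16x8 lanes, as in the ld64/ld128 kernels above.
    static inline v128_t acc_dot_wasmsimd(v128_t vacc, v128_t vxa, v128_t vxb)
    {
      const v128_t vprod = wasm_i16x8_mul(vxa, vxb);                   // int16 products
      vacc = wasm_i32x4_add(vacc, wasm_i32x4_widen_low_i16x8(vprod));  // lanes 0..3
      vacc = wasm_i32x4_add(vacc, wasm_i32x4_widen_high_i16x8(vprod)); // lanes 4..7
      return vacc;
    }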
D | 3x8c8-minmax-neon-mlal-padal.c |
  122  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  125  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  128  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  206  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  209  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
  95  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
  97  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
  99  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
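The c16 variants differ from the c8 ones only in how the operands arrive: one 16-byte int8x16_t load per input is split in registers with vget_low_s8/vget_high_s8, rather than being loaded as two separate int8x8_t slices. An illustrative sketch (names again hypothetical):

    #include <arm_neon.h>

    // Sketch of the c16 inner step: split each 16-byte vector into halves
    // in registers, then apply the same vmull/vmlal/vpadal sequence.
    static inline int32x4_t acc_dot_c16(int32x4_t vacc, int8x16_t va, int8x16_t vb)
    {
      int16x8_t vprod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));
      vprod = vmlal_s8(vprod, vget_high_s8(vb), vget_high_s8(va));
      return vpadalq_s16(vacc, vprod);
    }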
D | 3x4c8-xw-minmax-wasmsimd.c |
  107  const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd() local
  108  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
  109  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
|
D | 3x4c8-minmax-wasmsimd-ld128.c |
  110  const v128_t vprod1x2 = wasm_i16x8_mul(vxb2, vxa1);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128() local
  111  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
  120  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
|
D | 3x4c8-minmax-wasmsimd-ld64.c |
  107  const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64() local
  108  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
  109  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
  116  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
  119  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
  122  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
  144  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  148  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  152  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  251  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  255  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
  124  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  126  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  128  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  241  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  243  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c |
  119  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
  121  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
  123  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
  137  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
  141  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
  145  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8c8-minmax-neon-mlal-padal.c |
  113  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  115  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  117  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  174  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  176  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x4c8-minmax-wasmsimd-ld128.c |
  105  const v128_t vprod1x2 = wasm_i16x8_mul(vxb2, vxa1);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128() local
  106  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
  113  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
|
D | 2x4c8-minmax-wasmsimd-ld64.c |
  102  const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64() local
  103  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
  104  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
  108  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
  110  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
  112  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x4c8-minmax-wasmsimd-ld128.c |
  125  const v128_t vprod1x2 = wasm_i16x8_mul(vxb2, vxa1);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128() local
  126  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
  135  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
|
D | 3x4c8-minmax-wasmsimd-ld64.c |
  122  const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64() local
  123  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_low_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
  124  vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_widen_high_i16x8(vprod1x2));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
  137  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  140  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  143  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  221  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  224  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
  131  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
  134  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
  137  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
  137  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  139  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  141  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  254  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  256  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
  161  int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  165  vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  169  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  268  const int16x8_t vprod1x2 = vmull_s8(vb2, va1);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  272  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
  154  int16x8_t vprod1x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
  158  vprod1x2 = vmlal_s8(vprod1x2, vget_high_s8(vb2), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
  162  vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|