/external/XNNPACK/src/qs8-gemm/gen/ |
D | 2x8c8-minmax-neon-mlal-padal.c |
  107  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  109  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  111  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  166  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  168  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x4c8-xw-minmax-wasmsimd.c |
  97  const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd() local
  98  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd()
  99  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd()
|
D | 2x4c8-minmax-wasmsimd-ld128.c |
  98  const v128_t vprod1x3 = wasm_i16x8_mul(vxb3, vxa1);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128() local
  99  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
  103  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
|
D | 2x4c8-minmax-wasmsimd-ld64.c |
  97  const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64() local
  98  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
  99  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
  132  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  135  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  138  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  213  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  216  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
  101  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
  103  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
  105  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x4c8-xw-minmax-wasmsimd.c |
  118  const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd() local
  119  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
  120  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
|
D | 3x4c8-minmax-wasmsimd-ld128.c |
  118  const v128_t vprod1x3 = wasm_i16x8_mul(vxb3, vxa1);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128() local
  119  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
  126  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
|
D | 3x4c8-minmax-wasmsimd-ld64.c |
  118  const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64() local
  119  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
  120  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
  125  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
  128  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
  131  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
  157  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  161  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  165  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  260  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  264  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
  131  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  133  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  135  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  246  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  248  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 2x16c16-minmax-neon-mlal-padal.c |
  125  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal() local
  127  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
  129  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_2x16c16__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
  149  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
  153  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
  157  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
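All of the *-neon-mlal-padal.c entries above share one idiom: a widening s8 x s8 multiply (vmull_s8), a multiply-accumulate over the second half of the channel block (vmlal_s8), and a pairwise add-accumulate of the 16-bit products into 32-bit lanes (vpadalq_s16). A minimal sketch of that idiom, matching the c16 variants, is below; the helper name acc_dot_c16 is hypothetical, and only the intrinsic sequence comes from the listings.

  #include <arm_neon.h>

  /* Accumulate the products of one 16-byte channel block of A and B
   * into four 32-bit accumulator lanes, as in the c16 kernels above. */
  static inline int32x4_t acc_dot_c16(int32x4_t vacc, int8x16_t va, int8x16_t vb) {
    /* Widening multiply of the low 8 lanes: s8 * s8 -> s16. */
    int16x8_t vprod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));
    /* Multiply-accumulate the high 8 lanes into the same s16 products. */
    vprod = vmlal_s8(vprod, vget_high_s8(vb), vget_high_s8(va));
    /* Pairwise add adjacent s16 products into the s32 accumulator. */
    return vpadalq_s16(vacc, vprod);
  }

The c8 mlal variants compute the same thing from two separate 8-byte loads (va1x0/va1x1, vb3x0/vb3x1) instead of the low/high halves of one 16-byte load, and their remainder loops fall back to a single vmull_s8 plus vpadalq_s16, as in the second match group of each entry.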
/external/XNNPACK/src/qs8-igemm/gen/ |
D | 2x8c8-minmax-neon-mlal-padal.c |
  120  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  122  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  124  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
  179  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal() local
  181  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
|
D | 2x4c8-minmax-wasmsimd-ld128.c |
  111  const v128_t vprod1x3 = wasm_i16x8_mul(vxb3, vxa1);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128() local
  112  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
  116  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
|
D | 2x4c8-minmax-wasmsimd-ld64.c |
  110  const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64() local
  111  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
  112  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
|
D | 2x8c16-minmax-neon-mlal-padal.c |
  114  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal() local
  116  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
  118  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
|
D | 3x4c8-minmax-wasmsimd-ld128.c |
  133  const v128_t vprod1x3 = wasm_i16x8_mul(vxb3, vxa1);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128() local
  134  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
  141  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
|
D | 3x4c8-minmax-wasmsimd-ld64.c |
  133  const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64() local
  134  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_low_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
  135  vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_widen_high_i16x8(vprod1x3));  in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
|
D | 3x8c8-minmax-neon-mlal-padal.c |
  147  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  150  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  153  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
  228  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal() local
  231  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
|
D | 3x8c16-minmax-neon-mlal-padal.c |
  140  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal() local
  143  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
  146  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
|
D | 2x16c8-minmax-neon-mlal-padal.c |
  144  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  146  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  148  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
  259  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal() local
  261  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_2x16c8__neon_mlal_padal()
|
D | 4x8c8-minmax-neon-mlal-padal.c |
  174  int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  178  vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  182  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
  277  const int16x8_t vprod1x3 = vmull_s8(vb3, va1);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal() local
  281  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
|
D | 4x8c16-minmax-neon-mlal-padal.c |
  166  int16x8_t vprod1x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal() local
  170  vprod1x3 = vmlal_s8(vprod1x3, vget_high_s8(vb3), vget_high_s8(va1));  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
  174  vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);  in xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
|
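The *-wasmsimd*.c entries follow the analogous WAsm SIMD idiom: a lane-wise 16-bit multiply of the sign-extended inputs, then widening adds of the low and high product halves into four 32-bit sums. A minimal sketch with a hypothetical helper name follows; note that the wasm_i32x4_widen_*_i16x8 intrinsics in the listings are the older spelling of what current toolchains call wasm_i32x4_extend_*_i16x8.

  #include <wasm_simd128.h>

  /* Accumulate the products of eight 16-bit lanes (sign-extended s8 data)
   * into four 32-bit sums, as in the c8 wasmsimd kernels above. */
  static inline v128_t acc_dot_c8(v128_t vacc, v128_t vxa, v128_t vxb) {
    /* Lane-wise i16 products; s8-range inputs cannot overflow i16. */
    const v128_t vprod = wasm_i16x8_mul(vxa, vxb);
    /* Widen each half of the products to i32 and accumulate. */
    vacc = wasm_i32x4_add(vacc, wasm_i32x4_extend_low_i16x8(vprod));
    return wasm_i32x4_add(vacc, wasm_i32x4_extend_high_i16x8(vprod));
  }

The ld128 variants interleave the low and high widening adds with work on other columns, which is why their two add matches sit several source lines apart (e.g. 99 vs 103, 119 vs 126); the operand order of wasm_i16x8_mul (vxa1, vxb3 vs vxb3, vxa1) is immaterial since the multiply commutes.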