/external/XNNPACK/src/qs8-gemm/gen/
D | 3x8c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    123  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    126  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    129  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    207  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    210  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
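The NEON "mlal-padal" entries in this listing all follow one accumulation idiom. Reading the kernel names: 3x8c8 means a 3-row by 8-column output tile with K consumed 8 bytes per accumulator update, and vprod{m}x{n}/vacc{m}x{n} are the product and accumulator registers for output row m, column n. Per 16 bytes of K the kernel issues a widening 8-bit multiply (vmull_s8), a widening multiply-accumulate over the next 8 bytes (vmlal_s8), and a pairwise add of adjacent int16 lanes into the int32 accumulator (vpadalq_s16). The second hit group in each mlal entry (a lone vmull_s8 with no vmlal_s8) is the remainder path for K not divisible by 16, and the "mull" kernels further down use only that form. A minimal standalone sketch, not the generated kernel itself:

    #include <arm_neon.h>

    /* Sketch of the MULL -> MLAL -> PADAL step, assuming int8 inputs whose
     * pairwise product sums fit in int16 (e.g. weights restricted to
     * [-127, 127], as in the TFLite int8 scheme these kernels serve).
     * Accumulates a 16-element int8 dot product into the four int32 lanes
     * of vacc. */
    static inline int32x4_t acc_dot16_s8(int32x4_t vacc,
                                         int8x8_t va0, int8x8_t va1,
                                         int8x8_t vb0, int8x8_t vb1) {
      int16x8_t vprod = vmull_s8(vb0, va0);   /* 8 widening int8 products */
      vprod = vmlal_s8(vprod, vb1, va1);      /* 8 more, added in int16   */
      return vpadalq_s16(vacc, vprod);        /* pairwise add into int32  */
    }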
D | 3x4c8-xw-minmax-wasmsimd.c | xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
    110  const v128_t vprod2x2 = wasm_i16x8_mul(vxa2, vxb2);  (local)
    111  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_low_i16x8(vprod2x2));
    112  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_high_i16x8(vprod2x2));
|
D | 3x4c8-minmax-wasmsimd-ld128.c | xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
    112  const v128_t vprod2x2 = wasm_i16x8_mul(vxb2, vxa2);  (local)
    113  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_low_i16x8(vprod2x2));
    123  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_high_i16x8(vprod2x2));
|
D | 3x4c8-minmax-wasmsimd-ld64.c | xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
    110  const v128_t vprod2x2 = wasm_i16x8_mul(vxa2, vxb2);  (local)
    111  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_low_i16x8(vprod2x2));
    112  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_high_i16x8(vprod2x2));
|
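The three wasmsimd entries above compute the same update with 16-bit lane arithmetic: vxa2/vxb2 hold int8 data already sign-extended to int16, wasm_i16x8_mul forms the exact products (an int8 by int8 product always fits in int16), and the low and high halves are widened to int32 and added into the accumulator. The ld64 and ld128 variants differ only in how B is loaded and in the operand order of the multiply, which is immaterial since multiplication commutes; the xw ("extended weights") variant reads weights pre-widened to 16 bits, so it skips sign-extending B. A minimal sketch with illustrative names; note the widen_* intrinsics in the hits are the pre-standardization spelling of what current wasm_simd128.h headers call wasm_i32x4_extend_{low,high}_i16x8, used below so the sketch compiles on current toolchains:

    #include <wasm_simd128.h>

    /* Sketch (illustrative names, not the generated kernel): vxa and vxb
     * carry eight int16 lanes of sign-extended int8 data; the products are
     * widened to int32 and accumulated into the four lanes of vacc. */
    static inline v128_t acc_dot8_i16(v128_t vacc, v128_t vxa, v128_t vxb) {
      const v128_t vprod = wasm_i16x8_mul(vxa, vxb);
      vacc = wasm_i32x4_add(vacc, wasm_i32x4_extend_low_i16x8(vprod));
      return wasm_i32x4_add(vacc, wasm_i32x4_extend_high_i16x8(vprod));
    }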
D | 3x8c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    117  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    120  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    123  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
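The c16 entry above differs from the c8 kernels only in how the 16 bytes of K reach the multiplier: one 16-byte load split with vget_low_s8/vget_high_s8 instead of two 8-byte loads. Whatever the slicing, each vacc{m}x{n} accumulator computes an ordinary int8 dot product, spread across four int32 lanes that are folded together after the K loop. A scalar reference (a sketch, not XNNPACK code):

    #include <stddef.h>
    #include <stdint.h>

    /* What one vacc{m}x{n} accumulator represents: the dot product of
     * row m of A with column n of the packed weights over the K extent. */
    static int32_t ref_dot_s8(const int8_t* a, const int8_t* b, size_t kc) {
      int32_t acc = 0;
      for (size_t k = 0; k < kc; k++) {
        acc += (int32_t) a[k] * (int32_t) b[k];
      }
      return acc;
    }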
D | 4x8c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    145  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    149  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    153  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    252  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    256  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x8c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    138  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    142  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    146  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x16c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    155  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    158  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    161  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    319  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    322  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x16c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    149  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    152  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    155  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x16c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    185  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    189  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    193  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    396  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    400  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x8c8-minmax-neon-mull-padal.c | xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
    106  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    109  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x8c8-minmax-neon-mull-padal.c | xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
    125  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    129  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x16c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
    178  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    182  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    186  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
/external/XNNPACK/src/qs8-igemm/gen/ |
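The qs8-igemm files are the indirect-GEMM twins of the kernels above: the inner accumulation is byte-for-byte the same, but the rows of A arrive through an indirection buffer of pointers, so convolution patches can be gathered in place instead of being copied into a dense matrix first. A sketch of the indirection idea, with illustrative names and an assumed layout rather than the actual XNNPACK API:

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical indirection buffer: ks * mr row pointers, step-major,
     * where ks counts kernel taps (e.g. kernel_h * kernel_w) and mr is the
     * number of output rows the microkernel handles per call. */
    typedef struct {
      const int8_t** rows;
      size_t ks;
      size_t mr;
    } indirect_a;

    /* Pointer to the K-slice feeding output row m at kernel step s. */
    static inline const int8_t* a_row(const indirect_a* ia, size_t s, size_t m) {
      return ia->rows[s * ia->mr + m];
    }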
D | 3x4c8-minmax-wasmsimd-ld128.c | xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
    127  const v128_t vprod2x2 = wasm_i16x8_mul(vxb2, vxa2);  (local)
    128  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_low_i16x8(vprod2x2));
    138  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_high_i16x8(vprod2x2));
|
D | 3x4c8-minmax-wasmsimd-ld64.c | xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
    125  const v128_t vprod2x2 = wasm_i16x8_mul(vxa2, vxb2);  (local)
    126  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_low_i16x8(vprod2x2));
    127  vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_widen_high_i16x8(vprod2x2));
|
D | 3x8c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    138  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    141  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    144  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    222  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    225  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x8c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    132  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    135  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    138  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x8c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    162  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    166  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    170  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    269  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    273  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x8c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    155  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    159  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    163  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x16c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    170  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    173  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    176  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    334  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    337  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x16c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    164  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    167  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    170  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x16c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    202  int16x8_t vprod2x2 = vmull_s8(vb2x0, va2x0);  (local)
    206  vprod2x2 = vmlal_s8(vprod2x2, vb2x1, va2x1);
    210  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
    413  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    417  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 3x8c8-minmax-neon-mull-padal.c | xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
    121  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    124  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x8c8-minmax-neon-mull-padal.c | xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
    142  const int16x8_t vprod2x2 = vmull_s8(vb2, va2);  (local)
    146  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
D | 4x16c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
    195  int16x8_t vprod2x2 = vmull_s8(vget_low_s8(vb2), vget_low_s8(va2));  (local)
    199  vprod2x2 = vmlal_s8(vprod2x2, vget_high_s8(vb2), vget_high_s8(va2));
    203  vacc2x2 = vpadalq_s16(vacc2x2, vprod2x2);
|
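In both sections the K loop leaves one int32x4 of partial sums per (row, column) accumulator; the kernels then fold neighbouring columns together with pairwise adds before requantizing and clamping to the minmax range. A sketch of that fold for row 2, columns 0 to 3, using AArch64 intrinsics (the in-tree kernels also carry a 32-bit ARM path built on vpadd_s32):

    #include <arm_neon.h>

    /* Fold four per-column accumulators into one vector holding the
     * finished int32 sums for columns 0..3 of output row 2. */
    static inline int32x4_t fold_row2(int32x4_t vacc2x0, int32x4_t vacc2x1,
                                      int32x4_t vacc2x2, int32x4_t vacc2x3) {
      const int32x4_t vsum2x01 = vpaddq_s32(vacc2x0, vacc2x1);  /* pairs of col 0 | col 1 */
      const int32x4_t vsum2x23 = vpaddq_s32(vacc2x2, vacc2x3);  /* pairs of col 2 | col 3 */
      return vpaddq_s32(vsum2x01, vsum2x23);  /* [sum0, sum1, sum2, sum3] */
    }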