/external/XNNPACK/src/qs8-gemm/gen/
D | 1x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal()
     73  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
     74  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
     75  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    112  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    113  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 1x4c8-minmax-wasmsimd-ld64.c | in xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64()
     63  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     64  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     65  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x4c8-minmax-wasmsimd-ld128.c | in xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld128()
     63  const v128_t vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);  (local)
     64  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     67  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x4c8-xw-minmax-wasmsimd.c | in xnn_qs8_gemm_xw_minmax_ukernel_1x4c8__wasmsimd()
     63  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     64  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     65  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x8c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x8c16__neon_mlal_padal()
     70  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));  (local)
     71  vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
     72  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 2x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x8c8__neon_mlal_padal()
     92  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
     94  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
     96  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    155  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    157  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 2x4c8-xw-minmax-wasmsimd.c | in xnn_qs8_gemm_xw_minmax_ukernel_2x4c8__wasmsimd()
     78  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     79  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     80  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 2x4c8-minmax-wasmsimd-ld128.c | in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
     77  const v128_t vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);  (local)
     78  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     84  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 2x4c8-minmax-wasmsimd-ld64.c | in xnn_qs8_gemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
     78  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     79  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     80  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_1x16c8__neon_mlal_padal()
     89  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
     90  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
     91  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    160  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    161  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 3x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    111  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
    114  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
    117  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    198  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    201  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 2x8c16-minmax-neon-mlal-padal.c | in xnn_qs8_gemm_minmax_ukernel_2x8c16__neon_mlal_padal()
     88  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));  (local)
     90  vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
     92  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 3x4c8-xw-minmax-wasmsimd.c | in xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
     93  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     94  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     95  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 3x4c8-minmax-wasmsimd-ld128.c | in xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
     91  const v128_t vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);  (local)
     92  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
    101  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

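All of the *-neon-mlal-padal matches above are instances of one reduction idiom: widen signed 8-bit products to 16 bits with VMULL, fuse a second batch of products with VMLAL, then pairwise-accumulate the 16-bit lanes into 32-bit lanes with VPADAL. A minimal sketch of that idiom follows; acc_dot_s8 is a hypothetical helper name, not anything in the XNNPACK sources, and the c8 kernels reach the same shape by loading two 8-byte slices (vb1x0/vb1x1) instead of splitting one 16-byte register.

#include <arm_neon.h>

/* Sketch only: accumulate 16 signed-8-bit products of va and vb into
   four 32-bit accumulator lanes, mirroring the c16 kernels above. */
static inline int32x4_t acc_dot_s8(int32x4_t vacc, int8x16_t va, int8x16_t vb) {
  int16x8_t vprod = vmull_s8(vget_low_s8(vb), vget_low_s8(va));  /* 8 x s8*s8 -> s16 */
  vprod = vmlal_s8(vprod, vget_high_s8(vb), vget_high_s8(va));   /* fuse high halves */
  return vpadalq_s16(vacc, vprod);  /* pairwise-add adjacent s16 lanes into s32 */
}
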
/external/XNNPACK/src/qs8-igemm/gen/
D | 1x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal()
     84  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
     85  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
     86  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    123  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    124  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 1x4c8-minmax-wasmsimd-ld64.c | in xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64()
     74  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     75  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     76  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x4c8-minmax-wasmsimd-ld128.c | in xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld128()
     74  const v128_t vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);  (local)
     75  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     78  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x8c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x8c16__neon_mlal_padal()
     81  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));  (local)
     82  vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
     83  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 2x8c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x8c8__neon_mlal_padal()
    105  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
    107  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
    109  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    168  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    170  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 2x4c8-minmax-wasmsimd-ld128.c | in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld128()
     90  const v128_t vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);  (local)
     91  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     97  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 2x4c8-minmax-wasmsimd-ld64.c | in xnn_qs8_igemm_minmax_ukernel_2x4c8__wasmsimd_ld64()
     91  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
     92  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
     93  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 1x16c8-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_1x16c8__neon_mlal_padal()
    100  int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);  (local)
    101  vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
    102  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
    171  const int16x8_t vprod0x1 = vmull_s8(vb1, va0);  (local)
    172  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 2x8c16-minmax-neon-mlal-padal.c | in xnn_qs8_igemm_minmax_ukernel_2x8c16__neon_mlal_padal()
    101  int16x8_t vprod0x1 = vmull_s8(vget_low_s8(vb1), vget_low_s8(va0));  (local)
    103  vprod0x1 = vmlal_s8(vprod0x1, vget_high_s8(vb1), vget_high_s8(va0));
    105  vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);

D | 3x4c8-minmax-wasmsimd-ld128.c | in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
    106  const v128_t vprod0x1 = wasm_i16x8_mul(vxb1, vxa0);  (local)
    107  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
    116  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

D | 3x4c8-minmax-wasmsimd-ld64.c | in xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
    108  const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);  (local)
    109  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_low_i16x8(vprod0x1));
    110  vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_widen_high_i16x8(vprod0x1));

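The wasmsimd matches in both directories share a single widening multiply-accumulate step; the only difference visible here between the ld64, ld128, and xw variants is the operand order of wasm_i16x8_mul, which is commutative. A minimal sketch follows; acc_dot_i16x8 is a hypothetical helper name, not anything in the XNNPACK sources. Note that this snapshot predates the finalized WASM SIMD spec, which renamed the widening intrinsics to wasm_i32x4_extend_low_i16x8 / wasm_i32x4_extend_high_i16x8; the sketch keeps the older widen spellings used above.

#include <wasm_simd128.h>

/* Sketch only: vxa/vxb hold sign-extended int8 values in eight int16
   lanes, so the 16-bit product cannot overflow; its low and high halves
   are widened to int32 and added into the accumulator. */
static inline v128_t acc_dot_i16x8(v128_t vacc, v128_t vxa, v128_t vxb) {
  const v128_t vprod = wasm_i16x8_mul(vxa, vxb);                     /* 8 s16 products */
  vacc = wasm_i32x4_add(vacc, wasm_i32x4_widen_low_i16x8(vprod));    /* lanes 0-3 -> s32 */
  return wasm_i32x4_add(vacc, wasm_i32x4_widen_high_i16x8(vprod));   /* lanes 4-7 -> s32 */
}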