/external/XNNPACK/src/qs8-gemm/gen/
D | 3x8c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    133  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    136  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    139  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    214  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    217  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
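All of the NEON "mlal-padal" hits in this listing are one accumulation idiom: a widening int8 multiply (vmull_s8) over one 8-element slice of the reduction (K) dimension, a widening multiply-accumulate (vmlal_s8) over the next slice, then a pairwise add of the eight 16-bit products into the four 32-bit accumulator lanes (vpadalq_s16). A minimal sketch of that step, reusing the operand roles from the listing; the helper name acc_step_mlal is hypothetical, not an XNNPACK symbol:

    #include <arm_neon.h>

    /* c8 mlal-padal step for one accumulator (row 2, column 3):
     * va2x0/va2x1 are two 8-byte K slices of row 2 of A, and
     * vb3x0/vb3x1 the matching slices of column 3 of packed B. */
    static inline int32x4_t acc_step_mlal(
        int32x4_t vacc2x3,
        int8x8_t va2x0, int8x8_t va2x1,
        int8x8_t vb3x0, int8x8_t vb3x1)
    {
      int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  /* 8x8-bit -> 8x16-bit products */
      vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);  /* accumulate the second K slice */
      return vpadalq_s16(vacc2x3, vprod2x3);        /* pairwise add into s32 lanes */
    }

The remainder loops (the second [local] definition in each mlal file, e.g. line 214 above) have the same shape with a single vmull_s8 and no vmlal_s8.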
D | 3x4c8-xw-minmax-wasmsimd.c | xnn_qs8_gemm_xw_minmax_ukernel_3x4c8__wasmsimd()
    121  const v128_t vprod2x3 = wasm_i16x8_mul(vxa2, vxb3);  [local]
    122  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_low_i16x8(vprod2x3));
    123  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_high_i16x8(vprod2x3));
|
D | 3x4c8-minmax-wasmsimd-ld128.c | xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
    121  const v128_t vprod2x3 = wasm_i16x8_mul(vxb3, vxa2);  [local]
    122  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_low_i16x8(vprod2x3));
    127  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_high_i16x8(vprod2x3));
|
D | 3x4c8-minmax-wasmsimd-ld64.c | xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
    121  const v128_t vprod2x3 = wasm_i16x8_mul(vxa2, vxb3);  [local]
    122  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_low_i16x8(vprod2x3));
    123  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_high_i16x8(vprod2x3));
|
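The wasmsimd kernels above reach the same 32-bit accumulators a different way: the int8 inputs are already sign-extended into 16-bit lanes (vxa2, vxb3), so a plain wasm_i16x8_mul cannot overflow (|product| <= 128*128 = 16384), and the low and high halves of the 8x16-bit product are widened to 32 bits and added separately. A sketch under those assumptions; acc_step_wasm is a hypothetical name, and newer wasm_simd128.h headers spell the widening intrinsics wasm_i32x4_extend_{low,high}_i16x8 rather than widen:

    #include <wasm_simd128.h>

    /* c8 step: vxa2 and vxb3 hold int8 values sign-extended to i16 lanes. */
    static inline v128_t acc_step_wasm(v128_t vacc2x3, v128_t vxa2, v128_t vxb3)
    {
      const v128_t vprod2x3 = wasm_i16x8_mul(vxa2, vxb3);
      vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_low_i16x8(vprod2x3));
      return wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_high_i16x8(vprod2x3));
    }

The ld128 variant differs only in the multiply operand order and in the non-adjacent source lines of its low/high adds (122 and 127 above).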
D | 3x8c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    126  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    129  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    132  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
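The c16 kernels differ from their c8 counterparts only in how operands arrive: 16 K elements come in as one int8x16_t per operand and are split in-register with vget_low_s8/vget_high_s8 rather than loaded as two 8-byte halves. A sketch with a hypothetical helper name:

    #include <arm_neon.h>

    /* c16 step: va2 and vb3 are full 16-byte vectors; their halves feed
     * the same vmull/vmlal/vpadal sequence as the c8 mlal kernels. */
    static inline int32x4_t acc_step_c16(int32x4_t vacc2x3, int8x16_t va2, int8x16_t vb3)
    {
      int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));
      vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
      return vpadalq_s16(vacc2x3, vprod2x3);
    }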
D | 4x8c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    158  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    162  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    166  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    261  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    265  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x8c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    150  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    154  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    158  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x16c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    165  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    168  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    171  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    326  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    329  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x16c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    158  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    161  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    164  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x16c8-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    198  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    202  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    206  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    405  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    409  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x8c8-minmax-neon-mull-padal.c | xnn_qs8_gemm_minmax_ukernel_3x8c8__neon_mull_padal()
    113  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    116  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
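The "mull" kernels (as opposed to "mlal") process a single 8-element K slice per step, so the whole step is the widening multiply plus the pairwise accumulate, with no vmlal_s8 half; this matches the remainder path of the mlal kernels. A sketch, again with a hypothetical helper name:

    #include <arm_neon.h>

    /* mull-padal step: one widening multiply, then pairwise accumulate. */
    static inline int32x4_t acc_step_mull(int32x4_t vacc2x3, int8x8_t va2, int8x8_t vb3)
    {
      const int16x8_t vprod2x3 = vmull_s8(vb3, va2);
      return vpadalq_s16(vacc2x3, vprod2x3);
    }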
D | 4x8c8-minmax-neon-mull-padal.c | xnn_qs8_gemm_minmax_ukernel_4x8c8__neon_mull_padal()
    134  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    138  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x16c16-minmax-neon-mlal-padal.c | xnn_qs8_gemm_minmax_ukernel_4x16c16__neon_mlal_padal()
    190  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    194  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    198  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
/external/XNNPACK/src/qs8-igemm/gen/
D | 3x4c8-minmax-wasmsimd-ld128.c | xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld128()
    136  const v128_t vprod2x3 = wasm_i16x8_mul(vxb3, vxa2);  [local]
    137  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_low_i16x8(vprod2x3));
    142  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_high_i16x8(vprod2x3));
|
D | 3x4c8-minmax-wasmsimd-ld64.c | xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64()
    136  const v128_t vprod2x3 = wasm_i16x8_mul(vxa2, vxb3);  [local]
    137  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_low_i16x8(vprod2x3));
    138  vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_widen_high_i16x8(vprod2x3));
|
D | 3x8c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mlal_padal()
    148  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    151  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    154  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    229  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    232  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x8c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x8c16__neon_mlal_padal()
    141  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    144  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    147  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x8c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mlal_padal()
    175  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    179  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    183  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    278  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    282  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x8c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x8c16__neon_mlal_padal()
    167  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    171  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    175  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x16c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mlal_padal()
    180  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    183  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    186  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    341  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    344  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x16c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_3x16c16__neon_mlal_padal()
    173  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    176  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    179  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x16c8-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mlal_padal()
    215  int16x8_t vprod2x3 = vmull_s8(vb3x0, va2x0);  [local]
    219  vprod2x3 = vmlal_s8(vprod2x3, vb3x1, va2x1);
    223  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
    422  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    426  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 3x8c8-minmax-neon-mull-padal.c | xnn_qs8_igemm_minmax_ukernel_3x8c8__neon_mull_padal()
    128  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    131  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x8c8-minmax-neon-mull-padal.c | xnn_qs8_igemm_minmax_ukernel_4x8c8__neon_mull_padal()
    151  const int16x8_t vprod2x3 = vmull_s8(vb3, va2);  [local]
    155  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|
D | 4x16c16-minmax-neon-mlal-padal.c | xnn_qs8_igemm_minmax_ukernel_4x16c16__neon_mlal_padal()
    207  int16x8_t vprod2x3 = vmull_s8(vget_low_s8(vb3), vget_low_s8(va2));  [local]
    211  vprod2x3 = vmlal_s8(vprod2x3, vget_high_s8(vb3), vget_high_s8(va2));
    215  vacc2x3 = vpadalq_s16(vacc2x3, vprod2x3);
|