/external/XNNPACK/src/qs8-dwconv/gen/ |
D | up8x9-minmax-neon-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16():
     90  int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
     97  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
    103  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
    109  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
    115  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
    121  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
    127  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
    133  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
    139  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
    145  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
    [all …]
|
D | up16x9-minmax-neon-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16():
     90  int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
    101  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
    111  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
    121  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
    131  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
    141  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
    151  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
    161  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
    171  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
    181  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
    [all …]
|
D | up8x9-minmax-wasmsimd-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16():
     84  v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));  (local)
     94  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod0x01234567));
    103  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod1x01234567));
    112  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod2x01234567));
    121  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod3x01234567));
    130  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod4x01234567));
    139  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod5x01234567));
    148  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod6x01234567));
    157  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod7x01234567));
    166  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod8x01234567));
    [all …]
|
D | up16x9-minmax-wasmsimd-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16():
     84  v128_t vacc4567 = wasm_v128_load((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));  (local)
     99  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod0x01234567));
    113  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod1x01234567));
    127  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod2x01234567));
    141  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod3x01234567));
    155  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod4x01234567));
    169  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod5x01234567));
    183  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod6x01234567));
    197  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod7x01234567));
    211  vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_widen_high_i16x8(vprod8x01234567));
    [all …]
|
D | up24x9-minmax-neon-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16():
     90  int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
    105  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
    119  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
    133  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
    147  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
    161  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
    175  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
    189  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
    203  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
    217  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
    [all …]
|
D | up8x9-minmax-sse41-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16():
     84  __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t)));  (local)
     98  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp0x01234567lo, vp0x01234567hi));
    111  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp1x01234567lo, vp1x01234567hi));
    124  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp2x01234567lo, vp2x01234567hi));
    137  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp3x01234567lo, vp3x01234567hi));
    150  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp4x01234567lo, vp4x01234567hi));
    163  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp5x01234567lo, vp5x01234567hi));
    176  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp6x01234567lo, vp6x01234567hi));
    189  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp7x01234567lo, vp7x01234567hi));
    202  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp8x01234567lo, vp8x01234567hi));
    [all …]
|
D | up32x9-minmax-neon-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up32x9__neon_mul16():
     90  int32x4_t vacc4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  (local)
    109  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi0x01234567), vget_high_s16(vk0x01234567));
    127  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi1x01234567), vget_high_s16(vk1x01234567));
    145  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi2x01234567), vget_high_s16(vk2x01234567));
    163  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi3x01234567), vget_high_s16(vk3x01234567));
    181  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi4x01234567), vget_high_s16(vk4x01234567));
    199  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi5x01234567), vget_high_s16(vk5x01234567));
    217  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi6x01234567), vget_high_s16(vk6x01234567));
    235  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi7x01234567), vget_high_s16(vk7x01234567));
    253  vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi8x01234567), vget_high_s16(vk8x01234567));
    [all …]
|
D | up8x9-minmax-sse2-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse2_mul16():
     84  __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t)));  (local)
     98  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp0x01234567lo, vp0x01234567hi));
    111  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp1x01234567lo, vp1x01234567hi));
    124  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp2x01234567lo, vp2x01234567hi));
    137  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp3x01234567lo, vp3x01234567hi));
    150  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp4x01234567lo, vp4x01234567hi));
    163  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp5x01234567lo, vp5x01234567hi));
    176  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp6x01234567lo, vp6x01234567hi));
    189  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp7x01234567lo, vp7x01234567hi));
    202  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp8x01234567lo, vp8x01234567hi));
    [all …]
|
D | up8x9-minmax-ssse3-mul16.c | all refs in xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16():
     84  __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 4 * sizeof(int32_t)));  (local)
     98  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp0x01234567lo, vp0x01234567hi));
    111  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp1x01234567lo, vp1x01234567hi));
    124  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp2x01234567lo, vp2x01234567hi));
    137  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp3x01234567lo, vp3x01234567hi));
    150  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp4x01234567lo, vp4x01234567hi));
    163  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp5x01234567lo, vp5x01234567hi));
    176  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp6x01234567lo, vp6x01234567hi));
    189  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp7x01234567lo, vp7x01234567hi));
    202  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vp8x01234567lo, vp8x01234567hi));
    [all …]
|
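Note: all of the qs8-dwconv mul16 references above follow one accumulation idiom: the kernel loads int32 bias words into vacc0123/vacc4567, widens the int8 inputs and filter taps to int16, and folds the high half of each 8-lane product into vacc4567 with a widening multiply-accumulate (vmlal_s16 on NEON, widen-high-add on WAsm SIMD, unpackhi-add on SSE), once per tap. The block below is a minimal NEON sketch of that idiom for one 8-channel group over 9 taps; the function name, argument layout, and weight packing are assumptions for illustration, not the generated XNNPACK source.

    #include <arm_neon.h>
    #include <stdint.h>

    // Illustrative mul16 depthwise-convolution accumulation: 8 channels, 9 taps,
    // accumulators split into channel lanes 0..3 (vacc0123) and 4..7 (vacc4567).
    // The pointer layout (input[], weights, bias) is hypothetical, not XNNPACK's packed-weights blob.
    static void dwconv_mul16_sketch(const int8_t* const input[9], const int8_t* weights,
                                    const int32_t* bias, int32_t* out)
    {
      int32x4_t vacc0123 = vld1q_s32(bias);      // bias for channels 0..3
      int32x4_t vacc4567 = vld1q_s32(bias + 4);  // bias for channels 4..7
      for (int k = 0; k < 9; k++) {
        const int16x8_t vi = vmovl_s8(vld1_s8(input[k]));         // widen 8 int8 inputs to int16
        const int16x8_t vk = vmovl_s8(vld1_s8(weights + k * 8));  // widen 8 int8 weights to int16
        vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi),  vget_low_s16(vk));
        vacc4567 = vmlal_s16(vacc4567, vget_high_s16(vi), vget_high_s16(vk));
      }
      vst1q_s32(out, vacc0123);      // raw int32 accumulators; the real kernels requantize next
      vst1q_s32(out + 4, vacc4567);
    }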
/external/XNNPACK/src/qs8-vadd/gen/ |
D | minmax-neon-ld64-x8.c | all refs in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8():
     42  int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     45  vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vey01234567)), vy_multiplier);
     48  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     51  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     53  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
     72  int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     75  vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vey01234567)), vy_multiplier);
     78  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     81  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     83  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
|
D | minmax-xop-mul32-ld32-x8.c | all refs in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8():
     49  __m128i vacc4567 = _mm_macc_epi32(vx4567, vx_multiplier, vzero_point_product);  (local)
     52  vacc4567 = _mm_macc_epi32(vy4567, vy_multiplier, vacc4567);
     55  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     58  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     60  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
     79  __m128i vacc4567 = _mm_macc_epi32(vx4567, vx_multiplier, vzero_point_product);  (local)
     82  vacc4567 = _mm_macc_epi32(vy4567, vy_multiplier, vacc4567);
     85  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     88  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     90  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
|
D | minmax-wasmsimd-x8.c | all refs in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8():
     41  …v128_t vacc4567 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(v…  (local)
     44  …vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vy01234567), vy_mul…
     47  …nst v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vacc4567, vremainder_mask), wasm_i32x4_shr(vacc…
     50  …vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vacc4567, vshift), wasm_i32x4_gt(vrem4567, vremainder_thr…
     52  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
     69  …v128_t vacc4567 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(v…  (local)
     72  …vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vy01234567), vy_mul…
     75  …nst v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vacc4567, vremainder_mask), wasm_i32x4_shr(vacc…
     78  …vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vacc4567, vshift), wasm_i32x4_gt(vrem4567, vremainder_thr…
     80  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
|
D | minmax-neon-ld64-x16.c | all refs in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16():
     46  int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     51  vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vey01234567)), vy_multiplier);
     56  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     61  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     65  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
     85  int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     88  vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vey01234567)), vy_multiplier);
     91  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     94  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     96  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
|
D | minmax-sse41-mul32-ld32-x8.c | all refs in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8():
     44  __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx4567, vx_multiplier));  (local)
     47  vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vy4567, vy_multiplier));
     50  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     53  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     55  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
     74  __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx4567, vx_multiplier));  (local)
     77  vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vy4567, vy_multiplier));
     80  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     83  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     85  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
|
D | minmax-wasmsimd-x16.c | all refs in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16():
     43  …v128_t vacc4567 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(v…  (local)
     48  …vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vy01234567), vy_mul…
     53  …nst v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vacc4567, vremainder_mask), wasm_i32x4_shr(vacc…
     58  …vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vacc4567, vshift), wasm_i32x4_gt(vrem4567, vremainder_thr…
     62  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
     82  …v128_t vacc4567 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(v…  (local)
     85  …vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(vy01234567), vy_mul…
     88  …nst v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vacc4567, vremainder_mask), wasm_i32x4_shr(vacc…
     91  …vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vacc4567, vshift), wasm_i32x4_gt(vrem4567, vremainder_thr…
     93  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
|
D | minmax-sse41-mul16-ld64-x8.c | all refs in xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8():
     55  …__m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod0…  (local)
     58  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
     61  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     64  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     66  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
     95  …__m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod0…  (local)
     98  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
    101  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
    104  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
    106  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
|
D | minmax-neon-ld64-x24.c | all refs in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x24():
     50  int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     57  vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vey01234567)), vy_multiplier);
     64  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     71  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     77  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
    102  int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
    105  vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vey01234567)), vy_multiplier);
    108  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
    111  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
    113  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
|
D | minmax-xop-mul32-ld32-x16.c | all refs in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x16():
     53  __m128i vacc4567 = _mm_macc_epi32(vx4567, vx_multiplier, vzero_point_product);  (local)
     58  vacc4567 = _mm_macc_epi32(vy4567, vy_multiplier, vacc4567);
     63  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     68  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     72  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
     96  __m128i vacc4567 = _mm_macc_epi32(vx4567, vx_multiplier, vzero_point_product);  (local)
     99  vacc4567 = _mm_macc_epi32(vy4567, vy_multiplier, vacc4567);
    102  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
    105  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
    107  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
|
D | minmax-sse2-mul16-ld64-x8.c | all refs in xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8():
     57  …__m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod0…  (local)
     60  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
     63  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     66  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     68  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
     99  …__m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod0…  (local)
    102  vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
    105  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
    108  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
    110  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
|
D | minmax-sse41-mul32-ld32-x16.c | all refs in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x16():
     48  __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx4567, vx_multiplier));  (local)
     53  vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vy4567, vy_multiplier));
     58  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
     63  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
     67  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
     91  __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx4567, vx_multiplier));  (local)
     94  vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vy4567, vy_multiplier));
     97  … vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
    100  …vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_thr…
    102  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
|
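Note: every qs8-vadd variant listed above shares one structure: widen the two int8 operands, scale each by its own fixed-point multiplier into 32-bit accumulators (vacc0123/vacc4567) on top of a precomputed zero-point product, apply a rounding right shift, then narrow with saturation and add the output zero point. The NEON sketch below shows that flow for 8 elements; the parameter names are illustrative stand-ins for the kernel's packed params struct, and the sketch uses the plain round-half-up behaviour of vrshlq_s32 where the real kernels first apply a sign-dependent correction (the vsraq_n_s32/vbicq_s32 lines) to get round-half-away-from-zero.

    #include <arm_neon.h>
    #include <stdint.h>

    // Illustrative quantized add of 8 int8 elements. 'bias' is assumed to hold the
    // precomputed zero-point product term; final [qmin, qmax] clamping is omitted.
    static void qs8_vadd_sketch(const int8_t* x, const int8_t* y, int8_t* out,
                                int32_t bias, int32_t x_multiplier, int32_t y_multiplier,
                                int32_t shift, int16_t output_zero_point)
    {
      const int16x8_t vx = vmovl_s8(vld1_s8(x));  // widen x to int16
      const int16x8_t vy = vmovl_s8(vld1_s8(y));  // widen y to int16

      const int32x4_t vbias = vdupq_n_s32(bias);
      const int32x4_t vright_shift = vdupq_n_s32(-shift);  // negative count = shift right

      // Scale both operands into 32-bit accumulators, low and high halves.
      int32x4_t vacc0123 = vmlaq_n_s32(vbias, vmovl_s16(vget_low_s16(vx)),  x_multiplier);
      int32x4_t vacc4567 = vmlaq_n_s32(vbias, vmovl_s16(vget_high_s16(vx)), x_multiplier);
      vacc0123 = vmlaq_n_s32(vacc0123, vmovl_s16(vget_low_s16(vy)),  y_multiplier);
      vacc4567 = vmlaq_n_s32(vacc4567, vmovl_s16(vget_high_s16(vy)), y_multiplier);

      // Rounding right shift, narrow with saturation, re-center on the output zero point.
      vacc0123 = vrshlq_s32(vacc0123, vright_shift);
      vacc4567 = vrshlq_s32(vacc4567, vright_shift);
      int16x8_t vout = vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567));
      vout = vqaddq_s16(vout, vdupq_n_s16(output_zero_point));
      vst1_s8(out, vqmovn_s16(vout));
    }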
/external/XNNPACK/src/qs8-vaddc/gen/ |
D | minmax-neon-ld64-x8.c | all refs in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8():
     45  int32x4_t vacc4567 = vmlaq_s32(vy_bias, vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     48  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     51  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     53  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
     70  int32x4_t vacc4567 = vmlaq_s32(vy_bias, vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     73  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     76  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     78  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
|
D | minmax-neon-ld64-x16.c | all refs in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16():
     47  int32x4_t vacc4567 = vmlaq_s32(vy_bias, vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     52  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     57  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     61  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
     79  int32x4_t vacc4567 = vmlaq_s32(vy_bias, vmovl_s16(vget_high_s16(vex01234567)), vx_multiplier);  (local)
     82  vacc4567 = vsraq_n_s32(vacc4567, vbicq_s32(vacc4567, vzero_shift_mask), 31);
     85  vacc4567 = vrshlq_s32(vacc4567, vright_shift);
     87  …vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_p…
|
D | minmax-wasmsimd-x8.c | all refs in xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8():
     40  …v128_t vacc4567 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(v…  (local)
     43  …nst v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vacc4567, vremainder_mask), wasm_i32x4_shr(vacc…
     46  …vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vacc4567, vshift), wasm_i32x4_gt(vrem4567, vremainder_thr…
     48  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
     64  …v128_t vacc4567 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_high_i16x8(v…  (local)
     67  …nst v128_t vrem4567 = wasm_i32x4_add(wasm_v128_and(vacc4567, vremainder_mask), wasm_i32x4_shr(vacc…
     70  …vacc4567 = wasm_i32x4_sub(wasm_i32x4_shr(vacc4567, vshift), wasm_i32x4_gt(vrem4567, vremainder_thr…
     72  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
|
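Note: the qs8-vaddc kernels differ from the qs8-vadd kernels above only in that the second operand is a scalar, so its multiplier and zero-point contribution is folded into a per-call bias (vy_bias in the NEON listings) before the element loop, leaving one widening multiply-accumulate per element. A minimal sketch of that inner step, with hypothetical names:

    #include <arm_neon.h>
    #include <stdint.h>

    // Illustrative vaddc inner step. y_bias is assumed to already hold
    // bias + (int32_t) y_value * y_multiplier, computed once per call.
    static int32x4_t qs8_vaddc_accumulate(int16x4_t vex, int32_t y_bias, int32_t x_multiplier)
    {
      const int32x4_t vy_bias = vdupq_n_s32(y_bias);
      return vmlaq_n_s32(vy_bias, vmovl_s16(vex), x_multiplier);  // vy_bias + widen(x) * x_multiplier
    }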
/external/XNNPACK/src/f32-hswish/gen/ |
D | hswish-sse-x8.c | all refs in xnn_f32_hswish_ukernel__sse_x8():
     38  __m128 vacc4567 = _mm_mul_ps(vx4567, vsixth);  (local)
     41  vacc4567 = _mm_add_ps(vacc4567, vhalf);
     44  vacc4567 = _mm_max_ps(vacc4567, vzero);
     47  vacc4567 = _mm_min_ps(vacc4567, vone);
     50  vacc4567 = _mm_mul_ps(vacc4567, vx4567);
     53  _mm_storeu_ps(y + 4, vacc4567);
|
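Note: the hswish entry spells out the whole algorithm in its six references: y = x * min(max(x/6 + 1/2, 0), 1), evaluated four lanes at a time with vacc0123 and vacc4567. A standalone SSE sketch of the same computation for 8 floats follows; the function name and the unfused constants are illustrative, not the generated kernel.

    #include <xmmintrin.h>

    // Illustrative 8-wide h-swish: y = x * clamp(x/6 + 0.5, 0, 1).
    static void f32_hswish_sketch(const float* x, float* y)
    {
      const __m128 vsixth = _mm_set1_ps(1.0f / 6.0f);
      const __m128 vhalf  = _mm_set1_ps(0.5f);
      const __m128 vone   = _mm_set1_ps(1.0f);
      const __m128 vzero  = _mm_setzero_ps();

      const __m128 vx0123 = _mm_loadu_ps(x);
      const __m128 vx4567 = _mm_loadu_ps(x + 4);

      __m128 vacc0123 = _mm_add_ps(_mm_mul_ps(vx0123, vsixth), vhalf);  // x/6 + 0.5
      __m128 vacc4567 = _mm_add_ps(_mm_mul_ps(vx4567, vsixth), vhalf);
      vacc0123 = _mm_min_ps(_mm_max_ps(vacc0123, vzero), vone);         // clamp to [0, 1]
      vacc4567 = _mm_min_ps(_mm_max_ps(vacc4567, vzero), vone);
      vacc0123 = _mm_mul_ps(vacc0123, vx0123);                          // multiply back by x
      vacc4567 = _mm_mul_ps(vacc4567, vx4567);

      _mm_storeu_ps(y, vacc0123);
      _mm_storeu_ps(y + 4, vacc4567);
    }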
/external/XNNPACK/src/qs8-gavgpool/gen/ |
D | 7p7x-minmax-neon-c8-acc2.c | all refs in xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2():
     63  const int32x4_t vacc4567 = vaddw_s16(vbias, vget_high_s16(vacc0x01234567));  (local)
     66  vst1q_s32(b, vacc4567); b += 4;
    100  int32x4_t vacc4567 = vld1q_s32(b + 4);  (local)
    103  vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc0x01234567));
    106  vst1q_s32(b, vacc4567); b += 4;
    165  int32x4_t vacc4567 = vld1q_s32(buffer); buffer += 4;  (local)
    168  vacc4567 = vaddw_s16(vacc4567, vget_high_s16(vacc0x01234567));
    171  const int32x4_t vsgnacc4567 = vreinterpretq_s32_u32(vcltq_s32(vacc4567, vmovq_n_s32(0)));
    176  const int64x2_t vprod45 = vmull_s32(vget_low_s32(vacc4567), vget_low_s32(vmultiplier));
    177  const int64x2_t vprod67 = vmull_high_s32(vacc4567, vmultiplier);
    [all …]
|
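Note: the gavgpool references show the other half of the accumulator's life cycle: each pass sums up to seven rows of int8 values in int16, widens the running sum into the biased int32 accumulators (vacc0123, vacc4567) with vaddw_s16, spills them to a buffer between passes, and only the final pass applies the 64-bit multiplier and shift seen in the last listed references. The NEON sketch below illustrates the row-accumulation step for one 8-channel group; the row-pointer layout and names are assumptions for the example, not the generated kernel.

    #include <arm_neon.h>
    #include <stdint.h>

    // Illustrative 7-row accumulation for 8 channels of a global average pool:
    // sum the rows in int16 (7 * 127 fits comfortably), then widen into biased
    // int32 accumulators split across lanes 0..3 and 4..7.
    static void gavgpool_accumulate_sketch(const int8_t* const rows[7], int32_t bias,
                                           int32_t* out)
    {
      int16x8_t vsum = vaddl_s8(vld1_s8(rows[0]), vld1_s8(rows[1]));
      vsum = vaddw_s8(vsum, vld1_s8(rows[2]));
      vsum = vaddw_s8(vsum, vld1_s8(rows[3]));
      vsum = vaddw_s8(vsum, vld1_s8(rows[4]));
      vsum = vaddw_s8(vsum, vld1_s8(rows[5]));
      vsum = vaddw_s8(vsum, vld1_s8(rows[6]));

      const int32x4_t vbias = vdupq_n_s32(bias);
      const int32x4_t vacc0123 = vaddw_s16(vbias, vget_low_s16(vsum));
      const int32x4_t vacc4567 = vaddw_s16(vbias, vget_high_s16(vsum));
      vst1q_s32(out, vacc0123);      // spilled between passes; requantized in the last pass
      vst1q_s32(out + 4, vacc4567);
    }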