
Searched refs:vacc0123 (Results 1 – 25 of 325) sorted by relevance


/external/XNNPACK/src/f32-hswish/gen/
hswish-sse-x8.c
37 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); in xnn_f32_hswish_ukernel__sse_x8() local
40 vacc0123 = _mm_add_ps(vacc0123, vhalf); in xnn_f32_hswish_ukernel__sse_x8()
43 vacc0123 = _mm_max_ps(vacc0123, vzero); in xnn_f32_hswish_ukernel__sse_x8()
46 vacc0123 = _mm_min_ps(vacc0123, vone); in xnn_f32_hswish_ukernel__sse_x8()
49 vacc0123 = _mm_mul_ps(vacc0123, vx0123); in xnn_f32_hswish_ukernel__sse_x8()
52 _mm_storeu_ps(y, vacc0123); in xnn_f32_hswish_ukernel__sse_x8()
59 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); in xnn_f32_hswish_ukernel__sse_x8() local
60 vacc0123 = _mm_add_ps(vacc0123, vhalf); in xnn_f32_hswish_ukernel__sse_x8()
61 vacc0123 = _mm_max_ps(vacc0123, vzero); in xnn_f32_hswish_ukernel__sse_x8()
62 vacc0123 = _mm_min_ps(vacc0123, vone); in xnn_f32_hswish_ukernel__sse_x8()
[all …]
hswish-sse-x4.c
36 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); in xnn_f32_hswish_ukernel__sse_x4() local
38 vacc0123 = _mm_add_ps(vacc0123, vhalf); in xnn_f32_hswish_ukernel__sse_x4()
40 vacc0123 = _mm_max_ps(vacc0123, vzero); in xnn_f32_hswish_ukernel__sse_x4()
42 vacc0123 = _mm_min_ps(vacc0123, vone); in xnn_f32_hswish_ukernel__sse_x4()
44 vacc0123 = _mm_mul_ps(vacc0123, vx0123); in xnn_f32_hswish_ukernel__sse_x4()
46 _mm_storeu_ps(y, vacc0123); in xnn_f32_hswish_ukernel__sse_x4()
51 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); in xnn_f32_hswish_ukernel__sse_x4() local
52 vacc0123 = _mm_add_ps(vacc0123, vhalf); in xnn_f32_hswish_ukernel__sse_x4()
53 vacc0123 = _mm_max_ps(vacc0123, vzero); in xnn_f32_hswish_ukernel__sse_x4()
54 vacc0123 = _mm_min_ps(vacc0123, vone); in xnn_f32_hswish_ukernel__sse_x4()
[all …]
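
Note: the two hswish kernels above evaluate y = x * min(max(x/6 + 1/2, 0), 1), i.e. x * relu6(x + 3) / 6, four floats per __m128. A minimal standalone sketch of that sequence (loop bounds and constants are illustrative; the generated kernels read the constants from a params struct and handle remainder elements):

#include <stddef.h>
#include <xmmintrin.h>  /* SSE */

/* Sketch of the hswish vector body: y = x * clamp(x/6 + 1/2, 0, 1).
   Assumes n is a multiple of 4 for brevity. */
void hswish_sse_sketch(const float* x, float* y, size_t n) {
  const __m128 vsixth = _mm_set1_ps(0x1.555556p-3f);  /* ~1/6 */
  const __m128 vhalf  = _mm_set1_ps(0.5f);
  const __m128 vzero  = _mm_setzero_ps();
  const __m128 vone   = _mm_set1_ps(1.0f);
  for (size_t i = 0; i < n; i += 4) {
    const __m128 vx0123 = _mm_loadu_ps(x + i);
    __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);  /* x/6 */
    vacc0123 = _mm_add_ps(vacc0123, vhalf);        /* x/6 + 1/2 */
    vacc0123 = _mm_max_ps(vacc0123, vzero);        /* clamp below at 0 */
    vacc0123 = _mm_min_ps(vacc0123, vone);         /* clamp above at 1 */
    vacc0123 = _mm_mul_ps(vacc0123, vx0123);       /* scale back by x */
    _mm_storeu_ps(y + i, vacc0123);
  }
}
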
/external/XNNPACK/src/qs8-dwconv/gen/
up8x9-minmax-neon-mul16.c
89 int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16() local
96 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
102 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
108 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
114 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
120 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
126 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
132 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
138 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
144 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()
[all …]
up16x9-minmax-neon-mul16.c
89 int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16() local
100 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
110 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
120 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
130 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
140 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
150 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
160 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
170 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
180 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()
[all …]
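
Note: both NEON mul16 kernels above follow the same accumulation pattern: load four int32 bias lanes from the packed weights w, then for each of the 9 filter taps do a widening multiply-accumulate of the low four int16 input × kernel lanes with vmlal_s16. A sketch of one tap, assuming inputs already widened from s8 to s16 (function and parameter names are hypothetical):

#include <arm_neon.h>

/* One tap of the qs8 dwconv accumulation: widening multiply-accumulate of
   the low four lanes. vi/vk are assumed already widened from s8 to s16. */
int32x4_t dwconv_tap_sketch(int32x4_t vacc0123,
                            int16x8_t vi0x01234567,
                            int16x8_t vk0x01234567) {
  /* vacc0123[i] += (int32_t) vi[i] * (int32_t) vk[i], for i = 0..3 */
  return vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
}
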
up8x9-minmax-wasmsimd-mul16.c
83 v128_t vacc0123 = wasm_v128_load(w); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16() local
93 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
102 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
111 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
120 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
129 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod4x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
138 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod5x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
147 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod6x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
156 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod7x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
165 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod8x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()
[all …]
up16x9-minmax-wasmsimd-mul16.c
83 v128_t vacc0123 = wasm_v128_load(w); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16() local
98 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
112 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
126 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
140 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
154 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod4x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
168 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod5x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
182 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod6x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
196 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod7x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
210 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod8x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()
[all …]
up24x9-minmax-neon-mul16.c
89 int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16() local
104 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
118 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
132 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
146 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
160 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
174 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
188 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
202 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
216 vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567)); in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()
[all …]
up8x9-minmax-sse41-mul16.c
83 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16() local
97 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp0x01234567lo, vp0x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
110 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp1x01234567lo, vp1x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
123 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp2x01234567lo, vp2x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
136 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp3x01234567lo, vp3x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
149 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp4x01234567lo, vp4x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
162 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp5x01234567lo, vp5x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
175 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp6x01234567lo, vp6x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
188 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp7x01234567lo, vp7x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
201 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp8x01234567lo, vp8x01234567hi)); in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()
[all …]
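
Note: SSE has no widening multiply-accumulate, so this variant appears to build the 32-bit products from 16-bit low/high halves (via _mm_mullo_epi16/_mm_mulhi_epi16, not shown in the hits above) and interleave them with _mm_unpacklo_epi16 before adding. A hedged sketch of that idiom with hypothetical names:

#include <emmintrin.h>  /* SSE2 suffices for these three intrinsics */

/* Accumulate the low four 32-bit products of two s16x8 vectors: mullo/mulhi
   give the low/high 16 bits of each product, and unpacklo interleaves them
   into full 32-bit products for lanes 0..3 (little-endian: low half first). */
__m128i dwconv_sse_tap_sketch(__m128i vacc0123, __m128i vi, __m128i vk) {
  const __m128i vplo = _mm_mullo_epi16(vi, vk);
  const __m128i vphi = _mm_mulhi_epi16(vi, vk);
  return _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vplo, vphi));
}
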
/external/XNNPACK/src/f32-hswish/
sse.c.in
61 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); variable
62 vacc0123 = _mm_add_ps(vacc0123, vhalf);
63 vacc0123 = _mm_max_ps(vacc0123, vzero);
64 vacc0123 = _mm_min_ps(vacc0123, vone);
65 vacc0123 = _mm_mul_ps(vacc0123, vx0123);
66 _mm_storeu_ps(y, vacc0123);
71 __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth); variable
72 vacc0123 = _mm_add_ps(vacc0123, vhalf);
73 vacc0123 = _mm_max_ps(vacc0123, vzero);
74 vacc0123 = _mm_min_ps(vacc0123, vone);
[all …]
/external/XNNPACK/src/qs8-vadd/gen/
minmax-neon-ld64-x8.c
41 int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8() local
44 vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
47 vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
50 vacc0123 = vrshlq_s32(vacc0123, vright_shift); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
53 …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))… in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
71 int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8() local
74 vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
77 vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
80 vacc0123 = vrshlq_s32(vacc0123, vright_shift); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
83 …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))… in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()
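
Note: lines 47 and 50 above are the NEON rounding-right-shift requantization idiom: vrshlq_s32 with a negative shift count performs a rounding right shift with ties rounded up, and the preceding vsraq_n_s32 first adds (acc >> 31), i.e. -1 for negative lanes (suppressed where the shift is zero via vzero_shift_mask), so that ties end up rounding away from zero instead. A sketch isolating just those two steps:

#include <arm_neon.h>

/* The two-step rounding right shift from the kernels above. vright_shift
   holds non-positive shift counts; vzero_shift_mask is all-ones in lanes
   whose shift is zero, so the negative-lane adjustment is skipped there. */
int32x4_t requant_shift_sketch(int32x4_t vacc0123,
                               int32x4_t vright_shift,
                               int32x4_t vzero_shift_mask) {
  /* add (vacc >> 31): -1 for negative lanes, except where shift == 0 */
  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
  /* rounding shift right (vrshlq with a negative shift count) */
  return vrshlq_s32(vacc0123, vright_shift);
}
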
minmax-xop-mul32-ld32-x8.c
48 __m128i vacc0123 = _mm_macc_epi32(vx0123, vx_multiplier, vzero_point_product); in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8() local
51 vacc0123 = _mm_macc_epi32(vy0123, vy_multiplier, vacc0123); in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
54 … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si… in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
57 vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
60 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
78 __m128i vacc0123 = _mm_macc_epi32(vx0123, vx_multiplier, vzero_point_product); in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8() local
81 vacc0123 = _mm_macc_epi32(vy0123, vy_multiplier, vacc0123); in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
84 … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si… in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
87 vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
90 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()
minmax-wasmsimd-x8.c
40 …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() local
43 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
46 …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
49 vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
52 …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
68 …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8() local
71 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
74 …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
77 vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
80 …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()
minmax-neon-ld64-x16.c
45 int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16() local
50 vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
55 vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
60 vacc0123 = vrshlq_s32(vacc0123, vright_shift); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
65 …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))… in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
84 int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16() local
87 vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
90 vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
93 vacc0123 = vrshlq_s32(vacc0123, vright_shift); in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
96 …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))… in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()
minmax-sse41-mul32-ld32-x8.c
43 __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx0123, vx_multiplier)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8() local
46 vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vy0123, vy_multiplier)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
49 … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si… in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
52 vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
55 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
73 __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx0123, vx_multiplier)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8() local
76 vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vy0123, vy_multiplier)); in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
79 … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si… in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
82 vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
85 … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point); in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()
minmax-wasmsimd-x16.c
42 …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() local
47 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
52 …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
57 vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
62 …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
81 …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16() local
84 vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
87 …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
90 vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
93 …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput… in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()
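
Note: the XOP/SSE4.1/wasmsimd vadd variants above implement the same division by 2^shift without a rounding-shift instruction: mask off the low bits as a remainder, bias it by -1 for negative accumulators (the cmpgt against zero), arithmetic-shift, then add 1 where the remainder exceeds half the mask. A scalar sketch of my reading of those lines:

#include <stdint.h>

/* Divide acc by 2^shift, rounding to nearest with ties away from zero,
   mirroring the vector ops above lane by lane. */
int32_t rounding_shift_sketch(int32_t acc, uint32_t shift) {
  const int32_t remainder_mask = (int32_t) ((UINT32_C(1) << shift) - 1);
  const int32_t remainder_threshold = remainder_mask >> 1;
  /* cmpgt(0, acc) is -1 for negative lanes, 0 otherwise */
  const int32_t rem = (acc & remainder_mask) + (acc < 0 ? -1 : 0);
  /* subtracting cmpgt(rem, threshold) (-1 when true) adds the carry */
  return (acc >> shift) - (rem > remainder_threshold ? -1 : 0);
}
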
/external/XNNPACK/src/f32-spmm/gen/
32x1-minmax-wasmsimd-x86.c
42 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86() local
43 v128_t vacc4567 = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
44 v128_t vacc89AB = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
45 v128_t vaccCDEF = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
46 v128_t vaccGHIJ = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
47 v128_t vaccKLMN = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
48 v128_t vaccOPQR = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
49 v128_t vaccSTUV = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
63 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw)); in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
73 v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()
[all …]
32x1-minmax-neon.c
42 float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; in xnn_f32_spmm_minmax_ukernel_32x1__neon() local
43 float32x4_t vacc4567 = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
44 float32x4_t vacc89AB = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
45 float32x4_t vaccCDEF = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
46 float32x4_t vaccGHIJ = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
47 float32x4_t vaccKLMN = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
48 float32x4_t vaccOPQR = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
49 float32x4_t vaccSTUV = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neon()
66 vacc0123 = vmlaq_f32(vacc0123, vi0123, vw); in xnn_f32_spmm_minmax_ukernel_32x1__neon()
76 float32x4_t vout0123 = vminq_f32(vacc0123, vmax); in xnn_f32_spmm_minmax_ukernel_32x1__neon()
[all …]
32x1-minmax-neonfma.c
42 float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma() local
43 float32x4_t vacc4567 = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
44 float32x4_t vacc89AB = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
45 float32x4_t vaccCDEF = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
46 float32x4_t vaccGHIJ = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
47 float32x4_t vaccKLMN = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
48 float32x4_t vaccOPQR = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
49 float32x4_t vaccSTUV = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
66 vacc0123 = vfmaq_f32(vacc0123, vi0123, vw); in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
76 float32x4_t vout0123 = vminq_f32(vacc0123, vmax); in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()
[all …]
32x1-minmax-wasmsimd-arm.c
42 v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm() local
43 v128_t vacc4567 = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
44 v128_t vacc89AB = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
45 v128_t vaccCDEF = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
46 v128_t vaccGHIJ = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
47 v128_t vaccKLMN = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
48 v128_t vaccOPQR = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
49 v128_t vaccSTUV = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
63 vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw)); in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
73 v128_t vout0123 = wasm_f32x4_min(vacc0123, vmax); in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()
[all …]
32x1-minmax-sse.c
42 __m128 vacc0123 = _mm_load1_ps(w); w += 1; in xnn_f32_spmm_minmax_ukernel_32x1__sse() local
43 __m128 vacc4567 = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
44 __m128 vacc89AB = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
45 __m128 vaccCDEF = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
46 __m128 vaccGHIJ = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
47 __m128 vaccKLMN = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
48 __m128 vaccOPQR = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
49 __m128 vaccSTUV = vacc0123; in xnn_f32_spmm_minmax_ukernel_32x1__sse()
63 vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw)); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
73 __m128 vout0123 = _mm_min_ps(vacc0123, vmax); in xnn_f32_spmm_minmax_ukernel_32x1__sse()
[all …]
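
Note: all five spmm 32x1 kernels above share one structure: splat a per-row bias across eight 4-lane accumulators (vacc0123 through vaccSTUV, 32 output rows), multiply-accumulate each nonzero weight against the corresponding input slice, then clamp with min/max. A 4-lane SSE sketch of that pattern; the dmap byte-offset indirection is an assumption about how the sparse gather works here:

#include <stddef.h>
#include <stdint.h>
#include <xmmintrin.h>

/* One 4-row slice of the 32x1 spmm pattern: splat the bias, multiply-add
   each nonzero weight against its input slice, clamp, store. */
void spmm_slice_sketch(const float* w, const float* input, const int32_t* dmap,
                       size_t nnz, __m128 vmin, __m128 vmax, float* out) {
  __m128 vacc0123 = _mm_load1_ps(w); w += 1;  /* splat per-row bias */
  for (size_t i = 0; i < nnz; i++) {
    const __m128 vi0123 = _mm_loadu_ps((const float*) ((uintptr_t) input + dmap[i]));
    const __m128 vw = _mm_load1_ps(w); w += 1;  /* splat nonzero weight */
    vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
  }
  __m128 vout0123 = _mm_min_ps(vacc0123, vmax);  /* clamp above */
  vout0123 = _mm_max_ps(vout0123, vmin);         /* clamp below */
  _mm_storeu_ps(out, vout0123);
}
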
/external/XNNPACK/src/f32-dwconv/gen/
up8x4-minmax-wasmsimd-x86.c
103 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86() local
106 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
109 wasm_v128_store(output, vacc0123); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
143 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86() local
144 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
146 wasm_v128_store(output, vacc0123); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
169 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86() local
170 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
173 *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
174 vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()
[all …]
up8x4-minmax-wasmsimd-x86-acc2.c
106 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2() local
109 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
112 wasm_v128_store(output, vacc0123); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
148 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2() local
149 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
151 wasm_v128_store(output, vacc0123); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
176 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2() local
177 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
180 *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
181 vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3); in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()
[all …]
up4x4-minmax-wasmsimd-x86.c
90 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86() local
92 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()
94 wasm_v128_store(output, vacc0123); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()
117 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86() local
118 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()
121 *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()
122 vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()
126 *output = wasm_f32x4_extract_lane(vacc0123, 0); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()
up4x4-minmax-wasmsimd-x86-acc2.c
92 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2() local
94 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()
96 wasm_v128_store(output, vacc0123); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()
121 v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2() local
122 vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax)); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()
125 *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()
126 vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()
130 *output = wasm_f32x4_extract_lane(vacc0123, 0); in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()
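
Note: the wasmsimd-x86 dwconv kernels above clamp with wasm_v128_bitselect plus explicit compares rather than wasm_f32x4_min/max (which the -arm spmm variant earlier does use); presumably the compare-and-bitselect form lowers better to SSE on x86 engines, hence the -x86 suffix. A sketch of that branchless clamp:

#include <wasm_simd128.h>

/* Branchless clamp to [vmin, vmax]: bitselect takes lanes from its first
   argument where the mask is set, from the second otherwise. */
v128_t clamp_bitselect_sketch(v128_t vacc0123, v128_t vmin, v128_t vmax) {
  vacc0123 = wasm_v128_bitselect(vmin, vacc0123, wasm_f32x4_lt(vacc0123, vmin));
  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  return vacc0123;
}
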
/external/XNNPACK/src/qs8-vaddc/gen/
minmax-neon-ld64-x8.c
44 int32x4_t vacc0123 = vmlaq_s32(vy_bias, vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier); in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8() local
47 vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31); in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
50 vacc0123 = vrshlq_s32(vacc0123, vright_shift); in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
53 …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))… in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
69 int32x4_t vacc0123 = vmlaq_s32(vy_bias, vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier); in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8() local
72 vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31); in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
75 vacc0123 = vrshlq_s32(vacc0123, vright_shift); in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
78 …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))… in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()
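
Note: vaddc differs from the vadd kernels above only in that the constant addend y appears to be folded into a precomputed bias (vy_bias, roughly zero_point_product + y * y_multiplier, an inference from the names), so the inner loop needs a single widening multiply-accumulate per four lanes instead of two. A one-step NEON sketch:

#include <arm_neon.h>

/* qs8 vaddc accumulator step: the constant addend is prefolded into
   vy_bias, leaving one widening MAC for the variable input. */
int32x4_t vaddc_acc_sketch(int32x4_t vy_bias, int16x4_t vex0123,
                           int32x4_t vx_multiplier) {
  return vmlaq_s32(vy_bias, vmovl_s16(vex0123), vx_multiplier);
}
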
