/external/XNNPACK/src/f32-hswish/gen/

hswish-sse-x8.c (matches in xnn_f32_hswish_ukernel__sse_x8()):
   37  __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);   (local declaration)
   40  vacc0123 = _mm_add_ps(vacc0123, vhalf);
   43  vacc0123 = _mm_max_ps(vacc0123, vzero);
   46  vacc0123 = _mm_min_ps(vacc0123, vone);
   49  vacc0123 = _mm_mul_ps(vacc0123, vx0123);
   52  _mm_storeu_ps(y, vacc0123);
   59  __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);   (local declaration)
   60  vacc0123 = _mm_add_ps(vacc0123, vhalf);
   61  vacc0123 = _mm_max_ps(vacc0123, vzero);
   62  vacc0123 = _mm_min_ps(vacc0123, vone);
   [all …]

hswish-sse-x4.c (matches in xnn_f32_hswish_ukernel__sse_x4()):
   36  __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);   (local declaration)
   38  vacc0123 = _mm_add_ps(vacc0123, vhalf);
   40  vacc0123 = _mm_max_ps(vacc0123, vzero);
   42  vacc0123 = _mm_min_ps(vacc0123, vone);
   44  vacc0123 = _mm_mul_ps(vacc0123, vx0123);
   46  _mm_storeu_ps(y, vacc0123);
   51  __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);   (local declaration)
   52  vacc0123 = _mm_add_ps(vacc0123, vhalf);
   53  vacc0123 = _mm_max_ps(vacc0123, vzero);
   54  vacc0123 = _mm_min_ps(vacc0123, vone);
   [all …]

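Taken together, the matched lines above form the body of the SSE hswish
micro-kernel: y = x * min(max(x/6 + 1/2, 0), 1) on four floats at a time.
A minimal self-contained sketch of how they fit into a loop (the name
hswish_sse_x4 and the (n, x, y) signature are illustrative, not XNNPACK's
exact API; the sub-4-element tail the real kernels also handle is omitted):

#include <stddef.h>
#include <xmmintrin.h>

void hswish_sse_x4(size_t n, const float* x, float* y) {
  const __m128 vsixth = _mm_set1_ps(0x1.555556p-3f);  /* ~1/6 */
  const __m128 vhalf  = _mm_set1_ps(0.5f);
  const __m128 vone   = _mm_set1_ps(1.0f);
  const __m128 vzero  = _mm_setzero_ps();
  for (; n >= 4; n -= 4) {
    const __m128 vx = _mm_loadu_ps(x);
    x += 4;
    __m128 vacc = _mm_mul_ps(vx, vsixth);  /* x/6 */
    vacc = _mm_add_ps(vacc, vhalf);        /* x/6 + 1/2 */
    vacc = _mm_max_ps(vacc, vzero);        /* clamp below at 0 */
    vacc = _mm_min_ps(vacc, vone);         /* clamp above at 1 */
    vacc = _mm_mul_ps(vacc, vx);           /* scale back by x */
    _mm_storeu_ps(y, vacc);
    y += 4;
  }
}
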
/external/XNNPACK/src/qs8-dwconv/gen/

up8x9-minmax-neon-mul16.c (matches in xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16()):
   89  int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));   (local declaration)
   96  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
  102  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
  108  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
  114  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
  120  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
  126  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
  132  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
  138  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
  144  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
  [all …]

up16x9-minmax-neon-mul16.c (matches in xnn_qs8_dwconv_minmax_ukernel_up16x9__neon_mul16()):
   89  int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));   (local declaration)
  100  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
  110  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
  120  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
  130  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
  140  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
  150  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
  160  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
  170  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
  180  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
  [all …]

up8x9-minmax-wasmsimd-mul16.c (matches in xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16()):
   83  v128_t vacc0123 = wasm_v128_load(w);   (local declaration)
   93  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod0x01234567));
  102  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod1x01234567));
  111  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod2x01234567));
  120  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod3x01234567));
  129  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod4x01234567));
  138  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod5x01234567));
  147  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod6x01234567));
  156  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod7x01234567));
  165  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod8x01234567));
  [all …]

up16x9-minmax-wasmsimd-mul16.c (matches in xnn_qs8_dwconv_minmax_ukernel_up16x9__wasmsimd_mul16()):
   83  v128_t vacc0123 = wasm_v128_load(w);   (local declaration)
   98  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod0x01234567));
  112  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod1x01234567));
  126  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod2x01234567));
  140  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod3x01234567));
  154  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod4x01234567));
  168  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod5x01234567));
  182  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod6x01234567));
  196  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod7x01234567));
  210  vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_widen_low_i16x8(vprod8x01234567));
  [all …]

up24x9-minmax-neon-mul16.c (matches in xnn_qs8_dwconv_minmax_ukernel_up24x9__neon_mul16()):
   89  int32x4_t vacc0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));   (local declaration)
  104  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi0x01234567), vget_low_s16(vk0x01234567));
  118  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi1x01234567), vget_low_s16(vk1x01234567));
  132  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi2x01234567), vget_low_s16(vk2x01234567));
  146  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi3x01234567), vget_low_s16(vk3x01234567));
  160  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi4x01234567), vget_low_s16(vk4x01234567));
  174  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi5x01234567), vget_low_s16(vk5x01234567));
  188  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi6x01234567), vget_low_s16(vk6x01234567));
  202  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi7x01234567), vget_low_s16(vk7x01234567));
  216  vacc0123 = vmlal_s16(vacc0123, vget_low_s16(vi8x01234567), vget_low_s16(vk8x01234567));
  [all …]

up8x9-minmax-sse41-mul16.c (matches in xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16()):
   83  __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);   (local declaration)
   97  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp0x01234567lo, vp0x01234567hi));
  110  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp1x01234567lo, vp1x01234567hi));
  123  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp2x01234567lo, vp2x01234567hi));
  136  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp3x01234567lo, vp3x01234567hi));
  149  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp4x01234567lo, vp4x01234567hi));
  162  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp5x01234567lo, vp5x01234567hi));
  175  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp6x01234567lo, vp6x01234567hi));
  188  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp7x01234567lo, vp7x01234567hi));
  201  vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vp8x01234567lo, vp8x01234567hi));
  [all …]

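All six dwconv files above share the same "mul16" accumulation idiom: int8
inputs and weights are widened to int16, and each of the nine taps is
multiply-accumulated into int32 accumulators four lanes at a time (vacc0123
holds channels 0-3). A sketch of one tap over 8 channels in the NEON flavor
(the helper name and signature are illustrative):

#include <arm_neon.h>

static inline void dwconv_tap_acc8(
    const int8_t* i, const int8_t* k,
    int32x4_t* vacc0123, int32x4_t* vacc4567) {
  const int16x8_t vi = vmovl_s8(vld1_s8(i));  /* widen 8 int8 inputs */
  const int16x8_t vk = vmovl_s8(vld1_s8(k));  /* widen 8 int8 weights */
  /* accumulate channels 0-3 and 4-7 into separate int32x4 accumulators */
  *vacc0123 = vmlal_s16(*vacc0123, vget_low_s16(vi), vget_low_s16(vk));
  *vacc4567 = vmlal_s16(*vacc4567, vget_high_s16(vi), vget_high_s16(vk));
}
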
/external/XNNPACK/src/f32-hswish/

sse.c.in (kernel template; the gen/ files above are generated from it):
   61  __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);   (variable declaration)
   62  vacc0123 = _mm_add_ps(vacc0123, vhalf);
   63  vacc0123 = _mm_max_ps(vacc0123, vzero);
   64  vacc0123 = _mm_min_ps(vacc0123, vone);
   65  vacc0123 = _mm_mul_ps(vacc0123, vx0123);
   66  _mm_storeu_ps(y, vacc0123);
   71  __m128 vacc0123 = _mm_mul_ps(vx0123, vsixth);   (variable declaration)
   72  vacc0123 = _mm_add_ps(vacc0123, vhalf);
   73  vacc0123 = _mm_max_ps(vacc0123, vzero);
   74  vacc0123 = _mm_min_ps(vacc0123, vone);
   [all …]

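As a scalar reference for what the template and every generated variant
compute (a sketch, not XNNPACK's own scalar kernel): hswish(x) =
x * relu6(x + 3) / 6, which the SSE code factors as
x * min(max(x/6 + 1/2, 0), 1).

float hswish_ref(float x) {
  float t = x * (1.0f / 6.0f) + 0.5f;  /* x/6 + 1/2 */
  if (t < 0.0f) t = 0.0f;              /* clamp to [0, 1] */
  if (t > 1.0f) t = 1.0f;
  return x * t;
}
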
/external/XNNPACK/src/qs8-vadd/gen/

minmax-neon-ld64-x8.c (matches in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8()):
   41  int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier);   (local declaration)
   44  vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier);
   47  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
   50  vacc0123 = vrshlq_s32(vacc0123, vright_shift);
   53  …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))…
   71  int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier);   (local declaration)
   74  vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier);
   77  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
   80  vacc0123 = vrshlq_s32(vacc0123, vright_shift);
   83  …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))…

minmax-xop-mul32-ld32-x8.c (matches in xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8()):
   48  __m128i vacc0123 = _mm_macc_epi32(vx0123, vx_multiplier, vzero_point_product);   (local declaration)
   51  vacc0123 = _mm_macc_epi32(vy0123, vy_multiplier, vacc0123);
   54  … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
   57  …vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr…
   60  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
   78  __m128i vacc0123 = _mm_macc_epi32(vx0123, vx_multiplier, vzero_point_product);   (local declaration)
   81  vacc0123 = _mm_macc_epi32(vy0123, vy_multiplier, vacc0123);
   84  … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
   87  …vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr…
   90  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);

minmax-wasmsimd-x8.c (matches in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8()):
   40  …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx…   (local declaration)
   43  …vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult…
   46  …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc…
   49  …vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr…
   52  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
   68  …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx…   (local declaration)
   71  …vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult…
   74  …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc…
   77  …vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr…
   80  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…

minmax-neon-ld64-x16.c (matches in xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16()):
   45  int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier);   (local declaration)
   50  vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier);
   55  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
   60  vacc0123 = vrshlq_s32(vacc0123, vright_shift);
   65  …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))…
   84  int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier);   (local declaration)
   87  vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vey01234567)), vy_multiplier);
   90  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
   93  vacc0123 = vrshlq_s32(vacc0123, vright_shift);
   96  …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))…

minmax-sse41-mul32-ld32-x8.c (matches in xnn_qs8_vadd_minmax_ukernel__sse41_mul32_ld32_x8()):
   43  __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx0123, vx_multiplier));   (local declaration)
   46  vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vy0123, vy_multiplier));
   49  … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
   52  …vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr…
   55  __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
   73  __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_mullo_epi32(vx0123, vx_multiplier));   (local declaration)
   76  vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vy0123, vy_multiplier));
   79  … vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si…
   82  …vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_thr…
   85  … __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);

minmax-wasmsimd-x16.c (matches in xnn_qs8_vadd_minmax_ukernel__wasmsimd_x16()):
   42  …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx…   (local declaration)
   47  …vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult…
   52  …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc…
   57  …vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr…
   62  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…
   81  …v128_t vacc0123 = wasm_i32x4_add(vzero_point_product, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vx…   (local declaration)
   84  …vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_i32x4_widen_low_i16x8(vy01234567), vy_mult…
   87  …nst v128_t vrem0123 = wasm_i32x4_add(wasm_v128_and(vacc0123, vremainder_mask), wasm_i32x4_shr(vacc…
   90  …vacc0123 = wasm_i32x4_sub(wasm_i32x4_shr(vacc0123, vshift), wasm_i32x4_gt(vrem0123, vremainder_thr…
   93  …v128_t vout01234567 = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput…

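Every qs8-vadd variant above evaluates the same requantized addition; only
the SIMD instruction set differs. A scalar sketch of the arithmetic, with
the remainder-mask / vrshlq rounding collapsed into a plain round-to-nearest
shift (the names, the int64 widening, and the exact tie-breaking are
illustrative simplifications of what the SIMD code does):

#include <stdint.h>

int8_t qs8_add_ref(int8_t x, int8_t y, int32_t bias /* zero_point_product */,
                   int32_t x_mul, int32_t y_mul, uint32_t shift,
                   int32_t out_zero_point, int8_t out_min, int8_t out_max) {
  int64_t acc = (int64_t) bias + (int64_t) x * x_mul + (int64_t) y * y_mul;
  /* round-to-nearest arithmetic right shift by `shift` */
  acc = (acc + (shift > 0 ? (INT64_C(1) << (shift - 1)) : 0)) >> shift;
  acc += out_zero_point;  /* the SIMD code adds this with int16 saturation */
  if (acc < out_min) acc = out_min;
  if (acc > out_max) acc = out_max;
  return (int8_t) acc;
}
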
/external/XNNPACK/src/f32-spmm/gen/

32x1-minmax-wasmsimd-x86.c (matches in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86()):
   42  v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;   (local declaration)
   43  v128_t vacc4567 = vacc0123;
   44  v128_t vacc89AB = vacc0123;
   45  v128_t vaccCDEF = vacc0123;
   46  v128_t vaccGHIJ = vacc0123;
   47  v128_t vaccKLMN = vacc0123;
   48  v128_t vaccOPQR = vacc0123;
   49  v128_t vaccSTUV = vacc0123;
   63  vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
   73  v128_t vout0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
   [all …]

32x1-minmax-neon.c (matches in xnn_f32_spmm_minmax_ukernel_32x1__neon()):
   42  float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;   (local declaration)
   43  float32x4_t vacc4567 = vacc0123;
   44  float32x4_t vacc89AB = vacc0123;
   45  float32x4_t vaccCDEF = vacc0123;
   46  float32x4_t vaccGHIJ = vacc0123;
   47  float32x4_t vaccKLMN = vacc0123;
   48  float32x4_t vaccOPQR = vacc0123;
   49  float32x4_t vaccSTUV = vacc0123;
   66  vacc0123 = vmlaq_f32(vacc0123, vi0123, vw);
   76  float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
   [all …]

32x1-minmax-neonfma.c (matches in xnn_f32_spmm_minmax_ukernel_32x1__neonfma()):
   42  float32x4_t vacc0123 = vld1q_dup_f32(w); w += 1;   (local declaration)
   43  float32x4_t vacc4567 = vacc0123;
   44  float32x4_t vacc89AB = vacc0123;
   45  float32x4_t vaccCDEF = vacc0123;
   46  float32x4_t vaccGHIJ = vacc0123;
   47  float32x4_t vaccKLMN = vacc0123;
   48  float32x4_t vaccOPQR = vacc0123;
   49  float32x4_t vaccSTUV = vacc0123;
   66  vacc0123 = vfmaq_f32(vacc0123, vi0123, vw);
   76  float32x4_t vout0123 = vminq_f32(vacc0123, vmax);
   [all …]

32x1-minmax-wasmsimd-arm.c (matches in xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm()):
   42  v128_t vacc0123 = wasm_v32x4_load_splat(w); w += 1;   (local declaration)
   43  v128_t vacc4567 = vacc0123;
   44  v128_t vacc89AB = vacc0123;
   45  v128_t vaccCDEF = vacc0123;
   46  v128_t vaccGHIJ = vacc0123;
   47  v128_t vaccKLMN = vacc0123;
   48  v128_t vaccOPQR = vacc0123;
   49  v128_t vaccSTUV = vacc0123;
   63  vacc0123 = wasm_f32x4_add(vacc0123, wasm_f32x4_mul(vi0123, vw));
   73  v128_t vout0123 = wasm_f32x4_min(vacc0123, vmax);
   [all …]

32x1-minmax-sse.c (matches in xnn_f32_spmm_minmax_ukernel_32x1__sse()):
   42  __m128 vacc0123 = _mm_load1_ps(w); w += 1;   (local declaration)
   43  __m128 vacc4567 = vacc0123;
   44  __m128 vacc89AB = vacc0123;
   45  __m128 vaccCDEF = vacc0123;
   46  __m128 vaccGHIJ = vacc0123;
   47  __m128 vaccKLMN = vacc0123;
   48  __m128 vaccOPQR = vacc0123;
   49  __m128 vaccSTUV = vacc0123;
   63  vacc0123 = _mm_add_ps(vacc0123, _mm_mul_ps(vi0123, vw));
   73  __m128 vout0123 = _mm_min_ps(vacc0123, vmax);
   [all …]

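The five 32x1 spmm kernels above share one shape: splat the per-output-channel
bias into eight 4-lane accumulators covering 32 rows (vacc0123 through
vaccSTUV), multiply-accumulate one nonzero weight at a time, then clamp to
[vmin, vmax]. A scalar sketch of one 32-row block (names are illustrative;
dmap is assumed to hold byte offsets between consecutive nonzero columns,
mirroring the diff-encoded index map the kernels consume):

#include <stddef.h>
#include <stdint.h>

void spmm_32x1_ref(size_t nnz, const float* w, const int32_t* dmap,
                   const float* input, float* out, float vmin, float vmax) {
  float acc[32];
  const float bias = *w++;
  for (size_t m = 0; m < 32; m++) acc[m] = bias;  /* splat, like vacc4567 = vacc0123 */
  for (size_t k = 0; k < nnz; k++) {
    const float weight = *w++;
    for (size_t m = 0; m < 32; m++) acc[m] += input[m] * weight;
    input = (const float*) ((const char*) input + *dmap++);  /* diff-encoded advance */
  }
  for (size_t m = 0; m < 32; m++) {
    const float v = acc[m] < vmin ? vmin : acc[m];
    out[m] = v > vmax ? vmax : v;  /* minmax clamp */
  }
}
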
/external/XNNPACK/src/f32-dwconv/gen/

up8x4-minmax-wasmsimd-x86.c (matches in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86()):
  103  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  106  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  109  wasm_v128_store(output, vacc0123);
  143  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  144  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  146  wasm_v128_store(output, vacc0123);
  169  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  170  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  173  *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0);
  174  vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3);
  [all …]

up8x4-minmax-wasmsimd-x86-acc2.c (matches in xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86_acc2()):
  106  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  109  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  112  wasm_v128_store(output, vacc0123);
  148  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  149  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  151  wasm_v128_store(output, vacc0123);
  176  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  177  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  180  *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0);
  181  vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3);
  [all …]

up4x4-minmax-wasmsimd-x86.c (matches in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86()):
   90  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
   92  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
   94  wasm_v128_store(output, vacc0123);
  117  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  118  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  121  *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0);
  122  vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3);
  126  *output = wasm_f32x4_extract_lane(vacc0123, 0);

up4x4-minmax-wasmsimd-x86-acc2.c (matches in xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_x86_acc2()):
   92  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
   94  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
   96  wasm_v128_store(output, vacc0123);
  121  v128_t vacc0123 = wasm_v128_bitselect(vmin, vacc0123p0, wasm_f32x4_lt(vacc0123p0, vmin));   (local declaration)
  122  vacc0123 = wasm_v128_bitselect(vacc0123, vmax, wasm_f32x4_le(vacc0123, vmax));
  125  *((double*) output) = wasm_f64x2_extract_lane(vacc0123, 0);
  126  vacc0123 = wasm_v32x4_shuffle(vacc0123, vacc0123, 2, 3, 2, 3);
  130  *output = wasm_f32x4_extract_lane(vacc0123, 0);

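These wasmsimd-x86 dwconv variants clamp with wasm_v128_bitselect plus an
explicit compare instead of wasm_f32x4_min/max (their -arm counterparts use
min/max directly; the bitselect form is generally faster on x86 engines,
which is presumably why the generator emits both flavors). The idiom,
isolated as a helper (the name is illustrative):

#include <wasm_simd128.h>

static inline v128_t clamp_f32x4_x86(v128_t vacc, v128_t vmin, v128_t vmax) {
  /* take vmin wherever vacc < vmin, else keep vacc */
  vacc = wasm_v128_bitselect(vmin, vacc, wasm_f32x4_lt(vacc, vmin));
  /* keep vacc wherever vacc <= vmax, else take vmax */
  vacc = wasm_v128_bitselect(vacc, vmax, wasm_f32x4_le(vacc, vmax));
  return vacc;
}
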
/external/XNNPACK/src/qs8-vaddc/gen/

minmax-neon-ld64-x8.c (matches in xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8()):
   44  int32x4_t vacc0123 = vmlaq_s32(vy_bias, vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier);   (local declaration)
   47  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
   50  vacc0123 = vrshlq_s32(vacc0123, vright_shift);
   53  …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))…
   69  int32x4_t vacc0123 = vmlaq_s32(vy_bias, vmovl_s16(vget_low_s16(vex01234567)), vx_multiplier);   (local declaration)
   72  vacc0123 = vsraq_n_s32(vacc0123, vbicq_s32(vacc0123, vzero_shift_mask), 31);
   75  vacc0123 = vrshlq_s32(vacc0123, vright_shift);
   78  …const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567))…

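qs8-vaddc is the add-a-scalar-constant counterpart of qs8-vadd above: y is
uniform, so y * y_multiplier is folded into the bias once, outside the loop,
producing the vy_bias operand visible on line 44. A sketch of that
precomputation (the helper name is illustrative):

#include <stdint.h>

static inline int32_t qs8_vaddc_bias(int32_t zero_point_product,
                                     int8_t y, int32_t y_multiplier) {
  /* folded into every per-element vmlaq_s32(vy_bias, widen(x), vx_multiplier) */
  return zero_point_product + (int32_t) y * y_multiplier;
}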