// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/vunary.h>


void xnn_s8_vclamp_ukernel__sse2_x64(
    size_t n,
    const int8_t* x,
    int8_t* y,
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);

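  // SSE2 has no packed signed-byte min/max, so this kernel clamps via the sign-bit trick:
  // XOR each byte with the bias (expected to be 0x80 in every byte) to flip the sign bit,
  // which maps signed order onto unsigned order, clamp against the pre-biased limits with the
  // unsigned _mm_max_epu8/_mm_min_epu8, then XOR with the bias again to restore signed values.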
  const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
  const __m128i voutput_max_with_bias = _mm_load_si128((const __m128i*) params->sse2.max_with_bias);
  const __m128i voutput_min_with_bias = _mm_load_si128((const __m128i*) params->sse2.min_with_bias);
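  // Main loop: clamp 64 elements (four 16-byte vectors) per iteration.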
  for (; n >= 64; n -= 64) {
    __m128i vacc0 = _mm_loadu_si128((const __m128i*) x);
    __m128i vacc1 = _mm_loadu_si128((const __m128i*) x + 1);
    __m128i vacc2 = _mm_loadu_si128((const __m128i*) x + 2);
    __m128i vacc3 = _mm_loadu_si128((const __m128i*) x + 3);
    x += 64;

    vacc0 = _mm_xor_si128(vacc0, vbias);
    vacc1 = _mm_xor_si128(vacc1, vbias);
    vacc2 = _mm_xor_si128(vacc2, vbias);
    vacc3 = _mm_xor_si128(vacc3, vbias);

    vacc0 = _mm_max_epu8(vacc0, voutput_min_with_bias);
    vacc1 = _mm_max_epu8(vacc1, voutput_min_with_bias);
    vacc2 = _mm_max_epu8(vacc2, voutput_min_with_bias);
    vacc3 = _mm_max_epu8(vacc3, voutput_min_with_bias);

    vacc0 = _mm_min_epu8(vacc0, voutput_max_with_bias);
    vacc1 = _mm_min_epu8(vacc1, voutput_max_with_bias);
    vacc2 = _mm_min_epu8(vacc2, voutput_max_with_bias);
    vacc3 = _mm_min_epu8(vacc3, voutput_max_with_bias);

    vacc0 = _mm_xor_si128(vacc0, vbias);
    vacc1 = _mm_xor_si128(vacc1, vbias);
    vacc2 = _mm_xor_si128(vacc2, vbias);
    vacc3 = _mm_xor_si128(vacc3, vbias);

    _mm_storeu_si128((__m128i*) y, vacc0);
    _mm_storeu_si128((__m128i*) y + 1, vacc1);
    _mm_storeu_si128((__m128i*) y + 2, vacc2);
    _mm_storeu_si128((__m128i*) y + 3, vacc3);
    y += 64;
  }
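  // Tail loop: clamp any remaining full 16-byte vectors, one per iteration.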
  for (; n >= 16; n -= 16) {
    __m128i vacc = _mm_loadu_si128((const __m128i*) x);
    x += 16;

    vacc = _mm_xor_si128(vacc, vbias);
    vacc = _mm_min_epu8(vacc, voutput_max_with_bias);
    vacc = _mm_max_epu8(vacc, voutput_min_with_bias);
    vacc = _mm_xor_si128(vacc, vbias);

    _mm_storeu_si128((__m128i*) y, vacc);
    y += 16;
  }
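  // Remainder (1-15 elements): load a full 16-byte vector, which may read past the end of x.
  // The over-read is declared acceptable by the XNN_OOB_READS annotation on this function;
  // only the first n clamped bytes are stored back to y.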
  if XNN_UNLIKELY(n != 0) {
    __m128i vacc = _mm_loadu_si128((const __m128i*) x);

    vacc = _mm_xor_si128(vacc, vbias);
    vacc = _mm_min_epu8(vacc, voutput_max_with_bias);
    vacc = _mm_max_epu8(vacc, voutput_min_with_bias);
    vacc = _mm_xor_si128(vacc, vbias);

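    // Store the clamped result 8, 4, 2, and 1 bytes at a time, shifting the already-stored
    // lanes out of vacc between steps.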
    if (n & 8) {
      _mm_storel_epi64((__m128i*) y, vacc);
      y += 8;
      vacc = _mm_unpackhi_epi64(vacc, vacc);
    }
    if (n & 4) {
      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vacc);
      y += 4;
      vacc = _mm_srli_epi64(vacc, 32);
    }
    if (n & 2) {
      *((uint16_t*) y) = (uint16_t) _mm_cvtsi128_si32(vacc);
      y += 2;
      vacc = _mm_srli_epi32(vacc, 16);
    }
    if (n & 1) {
      *y = (int8_t) _mm_cvtsi128_si32(vacc);
    }
  }
}