// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/vunary.h>


void xnn_s8_vclamp_ukernel__sse2_x64(
    size_t n,
    const int8_t* x,
    int8_t* y,
    const union xnn_s8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
  assert(n != 0);

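  // SSE2 provides unsigned byte min/max (_mm_min_epu8/_mm_max_epu8) but no
  // signed-byte equivalents, so the kernel XORs inputs with a per-byte bias
  // (presumably 0x80) to map them into unsigned order; the clamp bounds are
  // stored pre-biased in the params struct, and a final XOR restores signed
  // values.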
  const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
  const __m128i voutput_max_with_bias = _mm_load_si128((const __m128i*) params->sse2.max_with_bias);
  const __m128i voutput_min_with_bias = _mm_load_si128((const __m128i*) params->sse2.min_with_bias);
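  // Main loop: clamp 64 bytes (four 128-bit vectors) per iteration.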
  for (; n >= 64; n -= 64) {
    __m128i vacc0 = _mm_loadu_si128((const __m128i*) x);
    __m128i vacc1 = _mm_loadu_si128((const __m128i*) x + 1);
    __m128i vacc2 = _mm_loadu_si128((const __m128i*) x + 2);
    __m128i vacc3 = _mm_loadu_si128((const __m128i*) x + 3);
    x += 64;

    vacc0 = _mm_xor_si128(vacc0, vbias);
    vacc1 = _mm_xor_si128(vacc1, vbias);
    vacc2 = _mm_xor_si128(vacc2, vbias);
    vacc3 = _mm_xor_si128(vacc3, vbias);

    vacc0 = _mm_max_epu8(vacc0, voutput_min_with_bias);
    vacc1 = _mm_max_epu8(vacc1, voutput_min_with_bias);
    vacc2 = _mm_max_epu8(vacc2, voutput_min_with_bias);
    vacc3 = _mm_max_epu8(vacc3, voutput_min_with_bias);

    vacc0 = _mm_min_epu8(vacc0, voutput_max_with_bias);
    vacc1 = _mm_min_epu8(vacc1, voutput_max_with_bias);
    vacc2 = _mm_min_epu8(vacc2, voutput_max_with_bias);
    vacc3 = _mm_min_epu8(vacc3, voutput_max_with_bias);

    vacc0 = _mm_xor_si128(vacc0, vbias);
    vacc1 = _mm_xor_si128(vacc1, vbias);
    vacc2 = _mm_xor_si128(vacc2, vbias);
    vacc3 = _mm_xor_si128(vacc3, vbias);

    _mm_storeu_si128((__m128i*) y, vacc0);
    _mm_storeu_si128((__m128i*) y + 1, vacc1);
    _mm_storeu_si128((__m128i*) y + 2, vacc2);
    _mm_storeu_si128((__m128i*) y + 3, vacc3);
    y += 64;
  }
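  // Clamp any remaining full 16-byte vectors one at a time.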
  for (; n >= 16; n -= 16) {
    __m128i vacc = _mm_loadu_si128((const __m128i*) x);
    x += 16;

    vacc = _mm_xor_si128(vacc, vbias);
    vacc = _mm_min_epu8(vacc, voutput_max_with_bias);
    vacc = _mm_max_epu8(vacc, voutput_min_with_bias);
    vacc = _mm_xor_si128(vacc, vbias);

    _mm_storeu_si128((__m128i*) y, vacc);
    y += 16;
  }
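  // Final partial vector (1-15 bytes): load a full 16 bytes (the kernel is
  // annotated XNN_OOB_READS, which appears to permit reading past the end of
  // x), clamp, then store only the valid low bytes in 8/4/2/1-byte pieces.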
  if XNN_UNLIKELY(n != 0) {
    __m128i vacc = _mm_loadu_si128((const __m128i*) x);

    vacc = _mm_xor_si128(vacc, vbias);
    vacc = _mm_min_epu8(vacc, voutput_max_with_bias);
    vacc = _mm_max_epu8(vacc, voutput_min_with_bias);
    vacc = _mm_xor_si128(vacc, vbias);

    if (n & 8) {
      _mm_storel_epi64((__m128i*) y, vacc);
      y += 8;
      vacc = _mm_unpackhi_epi64(vacc, vacc);
    }
    if (n & 4) {
      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vacc);
      y += 4;
      vacc = _mm_srli_epi64(vacc, 32);
    }
    if (n & 2) {
      *((uint16_t*) y) = (uint16_t) _mm_cvtsi128_si32(vacc);
      y += 2;
      vacc = _mm_srli_epi32(vacc, 16);
    }
    if (n & 1) {
      *y = (int8_t) _mm_cvtsi128_si32(vacc);
    }
  }
}