1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <string.h>

#include <emmintrin.h>

#include <xnnpack/clamp.h>
14
15
xnn_u8_clamp_ukernel__sse2_x64(size_t n,const uint8_t * x,uint8_t * y,const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_u8_clamp_ukernel__sse2_x64(
17 size_t n,
18 const uint8_t* x,
19 uint8_t* y,
20 const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
21 {
22 assert(n != 0);
23
24 const __m128i voutput_max = _mm_load_si128((const __m128i*) ¶ms->sse2.max);
25 const __m128i voutput_min = _mm_load_si128((const __m128i*) ¶ms->sse2.min);
26 for (; n >= 64; n -= 64) {
27 const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
28 const __m128i vx1 = _mm_loadu_si128((const __m128i*) x + 1);
29 const __m128i vx2 = _mm_loadu_si128((const __m128i*) x + 2);
30 const __m128i vx3 = _mm_loadu_si128((const __m128i*) x + 3);
31 x += 64;
32
33 const __m128i vy0 = _mm_min_epu8(_mm_max_epu8(vx0, voutput_min), voutput_max);
34 const __m128i vy1 = _mm_min_epu8(_mm_max_epu8(vx1, voutput_min), voutput_max);
35 const __m128i vy2 = _mm_min_epu8(_mm_max_epu8(vx2, voutput_min), voutput_max);
36 const __m128i vy3 = _mm_min_epu8(_mm_max_epu8(vx3, voutput_min), voutput_max);
37
38 _mm_storeu_si128((__m128i*) y, vy0);
39 _mm_storeu_si128((__m128i*) y + 1, vy1);
40 _mm_storeu_si128((__m128i*) y + 2, vy2);
41 _mm_storeu_si128((__m128i*) y + 3, vy3);
42 y += 64;
43 }
44 for (; n >= 8; n -= 8) {
45 __m128i vout = _mm_loadl_epi64((const __m128i*) x);
46 x += 8;
47 vout = _mm_min_epu8(vout, voutput_max);
48 vout = _mm_max_epu8(vout, voutput_min);
49 _mm_storel_epi64((__m128i*) y, vout);
50 y += 8;
51 }
52 if XNN_UNLIKELY(n != 0) {
53 __m128i vout = _mm_loadl_epi64((const __m128i*) x);
54 vout = _mm_min_epu8(vout, voutput_max);
55 vout = _mm_max_epu8(vout, voutput_min);
56 if (n & 4) {
57 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vout);
58 y += 4;
59 vout = _mm_srli_epi64(vout, 32);
60 }
61 if (n & 2) {
62 *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vout, 0);
63 y += 2;
64 vout = _mm_srli_epi32(vout, 16);
65 }
66 if (n & 1) {
67 *((uint8_t*) y) = (uint8_t) _mm_cvtsi128_si32(vout);
68 }
69 }
70 }
71