1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <string.h>

#include <emmintrin.h>

#include <xnnpack/clamp.h>
14
15
xnn_u8_clamp_ukernel__sse2(size_t n,const uint8_t * x,uint8_t * y,const union xnn_u8_output_params params[restrict static1])16 void xnn_u8_clamp_ukernel__sse2(
17 size_t n,
18 const uint8_t* x,
19 uint8_t* y,
20 const union xnn_u8_output_params params[restrict static 1])
21 {
22 assert(n != 0);
23
24 const __m128i voutput_max = _mm_load_si128((const __m128i*) ¶ms->sse2.max);
25 const __m128i voutput_min = _mm_load_si128((const __m128i*) ¶ms->sse2.min);
26 for (; n >= 64; n -= 64) {
27 const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
28 const __m128i vx1 = _mm_loadu_si128((const __m128i*) x + 1);
29 const __m128i vx2 = _mm_loadu_si128((const __m128i*) x + 2);
30 const __m128i vx3 = _mm_loadu_si128((const __m128i*) x + 3);
31 x += 64;
32
33 const __m128i vy0 = _mm_min_epu8(_mm_max_epu8(vx0, voutput_min), voutput_max);
34 const __m128i vy1 = _mm_min_epu8(_mm_max_epu8(vx1, voutput_min), voutput_max);
35 const __m128i vy2 = _mm_min_epu8(_mm_max_epu8(vx2, voutput_min), voutput_max);
36 const __m128i vy3 = _mm_min_epu8(_mm_max_epu8(vx3, voutput_min), voutput_max);
37
38 __builtin_prefetch(x + 640);
39
40 _mm_storeu_si128((__m128i*) y, vy0);
41 _mm_storeu_si128((__m128i*) y + 1, vy1);
42 _mm_storeu_si128((__m128i*) y + 2, vy2);
43 _mm_storeu_si128((__m128i*) y + 3, vy3);
44 y += 64;
45 }
46 for (; n >= 8; n -= 8) {
47 __m128i vout = _mm_loadl_epi64((const __m128i*) x);
48 x += 8;
49 vout = _mm_min_epu8(vout, voutput_max);
50 vout = _mm_max_epu8(vout, voutput_min);
51 _mm_storel_epi64((__m128i*) y, vout);
52 y += 8;
53 }
54 if XNN_UNLIKELY(n != 0) {
55 __m128i vout = _mm_loadl_epi64((const __m128i*) x);
56 vout = _mm_min_epu8(vout, voutput_max);
57 vout = _mm_max_epu8(vout, voutput_min);
58 if (n & 4) {
59 *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vout);
60 y += 4;
61 vout = _mm_srli_epi64(vout, 32);
62 }
63 if (n & 2) {
64 *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vout, 0);
65 y += 2;
66 vout = _mm_srli_epi32(vout, 16);
67 }
68 if (n & 1) {
69 *((uint8_t*) y) = (uint8_t) _mm_cvtsi128_si32(vout);
70 }
71 }
72 }
73