• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 
11 #include <emmintrin.h>
12 
13 #include <xnnpack/clamp.h>
14 
15 
xnn_u8_clamp_ukernel__sse2_x64(size_t n,const uint8_t * x,uint8_t * y,const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_u8_clamp_ukernel__sse2_x64(
17     size_t n,
18     const uint8_t* x,
19     uint8_t* y,
20     const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
21 {
22   assert(n != 0);
23 
24   const __m128i voutput_max = _mm_load_si128((const __m128i*) &params->sse2.max);
25   const __m128i voutput_min = _mm_load_si128((const __m128i*) &params->sse2.min);
26   for (; n >= 64; n -= 64) {
27     const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
28     const __m128i vx1 = _mm_loadu_si128((const __m128i*) x + 1);
29     const __m128i vx2 = _mm_loadu_si128((const __m128i*) x + 2);
30     const __m128i vx3 = _mm_loadu_si128((const __m128i*) x + 3);
31     x += 64;
32 
33     const __m128i vy0 = _mm_min_epu8(_mm_max_epu8(vx0, voutput_min), voutput_max);
34     const __m128i vy1 = _mm_min_epu8(_mm_max_epu8(vx1, voutput_min), voutput_max);
35     const __m128i vy2 = _mm_min_epu8(_mm_max_epu8(vx2, voutput_min), voutput_max);
36     const __m128i vy3 = _mm_min_epu8(_mm_max_epu8(vx3, voutput_min), voutput_max);
37 
38     _mm_storeu_si128((__m128i*) y, vy0);
39     _mm_storeu_si128((__m128i*) y + 1, vy1);
40     _mm_storeu_si128((__m128i*) y + 2, vy2);
41     _mm_storeu_si128((__m128i*) y + 3, vy3);
42     y += 64;
43   }
44   for (; n >= 8; n -= 8) {
45     __m128i vout = _mm_loadl_epi64((const __m128i*) x);
46     x += 8;
47     vout = _mm_min_epu8(vout, voutput_max);
48     vout = _mm_max_epu8(vout, voutput_min);
49     _mm_storel_epi64((__m128i*) y, vout);
50     y += 8;
51   }
52   if XNN_UNLIKELY(n != 0) {
53     __m128i vout = _mm_loadl_epi64((const __m128i*) x);
54     vout = _mm_min_epu8(vout, voutput_max);
55     vout = _mm_max_epu8(vout, voutput_min);
56     if (n & 4) {
57       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vout);
58       y += 4;
59       vout = _mm_srli_epi64(vout, 32);
60     }
61     if (n & 2) {
62       *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vout, 0);
63       y += 2;
64       vout = _mm_srli_epi32(vout, 16);
65     }
66     if (n & 1) {
67       *((uint8_t*) y) = (uint8_t) _mm_cvtsi128_si32(vout);
68     }
69   }
70 }
71