• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8 
#include <assert.h>
#include <string.h>

#include <emmintrin.h>

#include <xnnpack/clamp.h>
14 
15 
xnn_u8_clamp_ukernel__sse2(size_t n,const uint8_t * x,uint8_t * y,const union xnn_u8_output_params params[restrict static1])16 void xnn_u8_clamp_ukernel__sse2(
17     size_t n,
18     const uint8_t* x,
19     uint8_t* y,
20     const union xnn_u8_output_params params[restrict static 1])
21 {
22   assert(n != 0);
23 
24   const __m128i voutput_max = _mm_load_si128((const __m128i*) &params->sse2.max);
25   const __m128i voutput_min = _mm_load_si128((const __m128i*) &params->sse2.min);
26   for (; n >= 64; n -= 64) {
27     const __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
28     const __m128i vx1 = _mm_loadu_si128((const __m128i*) x + 1);
29     const __m128i vx2 = _mm_loadu_si128((const __m128i*) x + 2);
30     const __m128i vx3 = _mm_loadu_si128((const __m128i*) x + 3);
31     x += 64;
32 
33     const __m128i vy0 = _mm_min_epu8(_mm_max_epu8(vx0, voutput_min), voutput_max);
34     const __m128i vy1 = _mm_min_epu8(_mm_max_epu8(vx1, voutput_min), voutput_max);
35     const __m128i vy2 = _mm_min_epu8(_mm_max_epu8(vx2, voutput_min), voutput_max);
36     const __m128i vy3 = _mm_min_epu8(_mm_max_epu8(vx3, voutput_min), voutput_max);
37 
38     __builtin_prefetch(x + 640);
39 
40     _mm_storeu_si128((__m128i*) y, vy0);
41     _mm_storeu_si128((__m128i*) y + 1, vy1);
42     _mm_storeu_si128((__m128i*) y + 2, vy2);
43     _mm_storeu_si128((__m128i*) y + 3, vy3);
44     y += 64;
45   }
46   for (; n >= 8; n -= 8) {
47     __m128i vout = _mm_loadl_epi64((const __m128i*) x);
48     x += 8;
49     vout = _mm_min_epu8(vout, voutput_max);
50     vout = _mm_max_epu8(vout, voutput_min);
51     _mm_storel_epi64((__m128i*) y, vout);
52     y += 8;
53   }
54   if XNN_UNLIKELY(n != 0) {
55     __m128i vout = _mm_loadl_epi64((const __m128i*) x);
56     vout = _mm_min_epu8(vout, voutput_max);
57     vout = _mm_max_epu8(vout, voutput_min);
58     if (n & 4) {
59       *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vout);
60       y += 4;
61       vout = _mm_srli_epi64(vout, 32);
62     }
63     if (n & 2) {
64       *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vout, 0);
65       y += 2;
66       vout = _mm_srli_epi32(vout, 16);
67     }
68     if (n & 1) {
69       *((uint8_t*) y) = (uint8_t) _mm_cvtsi128_si32(vout);
70     }
71   }
72 }
73