• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <assert.h>
10 
11 #include <arm_neon.h>
12 
13 #include <xnnpack/clamp.h>
14 
15 
xnn_u8_clamp_ukernel__neon_x64(size_t n,const uint8_t * x,uint8_t * y,const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_u8_clamp_ukernel__neon_x64(
17     size_t n,
18     const uint8_t* x,
19     uint8_t* y,
20     const union xnn_u8_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
21 {
22   assert(n != 0);
23 
24   const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.max);
25   const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.min);
26 
27   for (; n >= 64; n -= 64) {
28     const uint8x16_t vx0 = vld1q_u8(x); x += 16;
29     const uint8x16_t vx1 = vld1q_u8(x); x += 16;
30     const uint8x16_t vx2 = vld1q_u8(x); x += 16;
31     const uint8x16_t vx3 = vld1q_u8(x); x += 16;
32 
33     const uint8x16_t vy0 = vminq_u8(vmaxq_u8(vx0, voutput_min), voutput_max);
34     const uint8x16_t vy1 = vminq_u8(vmaxq_u8(vx1, voutput_min), voutput_max);
35     const uint8x16_t vy2 = vminq_u8(vmaxq_u8(vx2, voutput_min), voutput_max);
36     const uint8x16_t vy3 = vminq_u8(vmaxq_u8(vx3, voutput_min), voutput_max);
37 
38     vst1q_u8(y, vy0); y += 16;
39     vst1q_u8(y, vy1); y += 16;
40     vst1q_u8(y, vy2); y += 16;
41     vst1q_u8(y, vy3); y += 16;
42   }
43   for (; n >= 8; n -= 8) {
44     uint8x8_t vout = vld1_u8(x); x += 8;
45     vout = vmin_u8(vout, vget_low_u8(voutput_max));
46     vout = vmax_u8(vout, vget_low_u8(voutput_min));
47     vst1_u8(y, vout); y += 8;
48   }
49   if XNN_UNLIKELY(n != 0) {
50     uint8x8_t vout = vld1_u8(x);
51     vout = vmin_u8(vout, vget_low_u8(voutput_max));
52     vout = vmax_u8(vout, vget_low_u8(voutput_min));
53 
54     if (n & 4) {
55       vst1_lane_u32(__builtin_assume_aligned(y, 1), vreinterpret_u32_u8(vout), 0); y += 4;
56       vout = vext_u8(vout, vout, 4);
57     }
58     if (n & 2) {
59       vst1_lane_u16(__builtin_assume_aligned(y, 1), vreinterpret_u16_u8(vout), 0); y += 2;
60       vout = vext_u8(vout, vout, 2);
61     }
62     if (n & 1) {
63       vst1_lane_u8(y, vout, 0);
64     }
65   }
66 }
67