// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

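// Template parameters: BATCH_TILE is the number of bytes translated per
// main-loop iteration (a multiple of 16, e.g. 16/32/48/64); SIMD_TILE is the
// number of 16-byte NEON registers covering one batch tile.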
$assert BATCH_TILE >= 16
$assert BATCH_TILE % 16 == 0
$SIMD_TILE = BATCH_TILE // 16
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/lut.h>
#include <xnnpack/common.h>

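// LUT (table lookup) microkernel: computes y[i] = t[x[i]] over n bytes using a
// 256-entry byte table. A NEON TBL/TBX instruction can index at most four
// 16-byte table registers (64 bytes) at a time, so the 256-byte table is split
// into four 64-byte sub-tables that are consulted in sequence.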
void xnn_x8_lut_ukernel__neon_tbx128x4_x${BATCH_TILE}(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
{
  assert(n != 0);
  assert(x != NULL);
  assert(y != NULL);

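  // Load the 256-byte table as four 64-byte sub-tables (the vqtbl4q/vqtbx4q
  // intrinsics index at most 64 bytes); voffset rebases indices from one
  // sub-table to the next.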
  const uint8x16x4_t vtable0123 = vld1q_u8_x4(t);
  const uint8x16x4_t vtable4567 = vld1q_u8_x4(t + 64);
  const uint8x16x4_t vtable89AB = vld1q_u8_x4(t + 128);
  const uint8x16x4_t vtableCDEF = vld1q_u8_x4(t + 192);
  const uint8x16_t voffset = vmovq_n_u8(64);
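  // Main loop (generated only when BATCH_TILE > 16): translates BATCH_TILE
  // bytes per iteration across SIMD_TILE 16-byte registers.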
  $if BATCH_TILE > 16:
    for (; n >= ${BATCH_TILE} * sizeof(uint8_t); n -= ${BATCH_TILE} * sizeof(uint8_t)) {
      $for N in range(SIMD_TILE):
        uint8x16_t vx${N} = vld1q_u8(x); x += 16;

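      // Stage 1: TBL translates lanes with indices 0-63 through vtable0123;
      // out-of-range lanes become 0. Subtracting 64 rebases the indices for
      // the next sub-table; already-translated lanes wrap to >= 192 and stay
      // out of range in the TBX stages below.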
      $for N in range(SIMD_TILE):
        uint8x16_t vy${N} = vqtbl4q_u8(vtable0123, vx${N});
        vx${N} = vsubq_u8(vx${N}, voffset);

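      // Stages 2-4: TBX overwrites only lanes whose rebased index is in 0-63,
      // preserving the previously translated lanes.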
      $for N in range(SIMD_TILE):
        vy${N} = vqtbx4q_u8(vy${N}, vtable4567, vx${N});
        vx${N} = vsubq_u8(vx${N}, voffset);

      $for N in range(SIMD_TILE):
        vy${N} = vqtbx4q_u8(vy${N}, vtable89AB, vx${N});
        vx${N} = vsubq_u8(vx${N}, voffset);

      $for N in range(SIMD_TILE):
        vy${N} = vqtbx4q_u8(vy${N}, vtableCDEF, vx${N});

      $for N in range(SIMD_TILE):
        vst1q_u8(y, vy${N}); y += 16;
    }
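  // Translate the remaining full 16-byte blocks, one register at a time.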
  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
    uint8x16_t vx = vld1q_u8(x); x += 16;

    uint8x16_t vy = vqtbl4q_u8(vtable0123, vx);

    vx = vsubq_u8(vx, voffset);
    vy = vqtbx4q_u8(vy, vtable4567, vx);

    vx = vsubq_u8(vx, voffset);
    vy = vqtbx4q_u8(vy, vtable89AB, vx);

    vx = vsubq_u8(vx, voffset);
    vy = vqtbx4q_u8(vy, vtableCDEF, vx);

    vst1q_u8(y, vy); y += 16;
  }
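  // Tail: 1 to 15 bytes remain. A full 16-byte register is loaded and
  // translated, but only the n valid result bytes are stored below.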
  if XNN_UNLIKELY(n != 0) {
    uint8x16_t vx = vld1q_u8(x);

    uint8x16_t vy = vqtbl4q_u8(vtable0123, vx);

    vx = vsubq_u8(vx, voffset);
    vy = vqtbx4q_u8(vy, vtable4567, vx);

    vx = vsubq_u8(vx, voffset);
    vy = vqtbx4q_u8(vy, vtable89AB, vx);

    vx = vsubq_u8(vx, voffset);
    vy = vqtbx4q_u8(vy, vtableCDEF, vx);

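    // Store the n remaining result bytes in progressively smaller pieces
    // (8, 4, 2, then 1), shifting consumed bytes out of vy_lo after each store.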
    uint8x8_t vy_lo = vget_low_u8(vy);
    if (n & (8 * sizeof(uint8_t))) {
      vst1_u8(y, vy_lo); y += 8;
      vy_lo = vget_high_u8(vy);
    }
    if (n & (4 * sizeof(uint8_t))) {
      vst1_lane_u32((void*) y, vreinterpret_u32_u8(vy_lo), 0); y += 4;
      vy_lo = vext_u8(vy_lo, vy_lo, 4);
    }
    if (n & (2 * sizeof(uint8_t))) {
      vst1_lane_u16((void*) y, vreinterpret_u16_u8(vy_lo), 0); y += 2;
      vy_lo = vext_u8(vy_lo, vy_lo, 2);
    }
    if (n & (1 * sizeof(uint8_t))) {
      vst1_lane_u8(y, vy_lo, 0);
    }
  }
}
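// Call-site sketch (illustrative only; names other than the kernel are
// hypothetical): with BATCH_TILE = 64, this template yields
// xnn_x8_lut_ukernel__neon_tbx128x4_x64, invoked as
//
//   const uint8_t table[256] = { /* output byte = table[input byte] */ };
//   xnn_x8_lut_ukernel__neon_tbx128x4_x64(count, input, output, table);
//
// where count is the number of bytes to translate.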