// Auto-generated file. Do not edit!
// Template: src/x8-lut/ssse3.c.in
// Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <tmmintrin.h>

#include <xnnpack/lut.h>
#include <xnnpack/common.h>


void xnn_x8_lut_ukernel__ssse3_x32(
    size_t n,
    const uint8_t* x,
    uint8_t* y,
    const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
{
  assert(n != 0);
  assert(x != NULL);
  assert(y != NULL);

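  // Load the 256-byte lookup table t as sixteen 16-byte vectors.
  // _mm_load_si128 is an aligned load, so t is expected to be 16-byte aligned.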
  const __m128i vt0 = _mm_load_si128((const __m128i*) t);
  const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
  const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
  const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
  const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
  const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
  const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
  const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
  const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
  const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
  const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
  const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
  const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
  const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
  const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
  const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));

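  // Pre-XOR adjacent 16-byte table chunks. PSHUFB zeroes any lane whose index
  // byte has its high bit set, and zero is the XOR identity, so XOR-ing the
  // per-chunk lookups in the loops below telescopes back to the original table
  // entry t[x]. The vtable0..vtable7 terms folded into vtable8..vtableF cancel
  // the lower-chunk lookups that the wrapping index subtraction picks up for
  // inputs in the upper half of the table.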
  const __m128i vtable0 = vt0;
  const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
  const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
  const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
  const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
  const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
  const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
  const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
  const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
  const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
  const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
  const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
  const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
  const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
  const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
  const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);

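  // The main loop processes 32 input bytes (two vectors) per iteration. For
  // each input vector: look up the first chunk, then repeatedly subtract 16
  // from the indices and XOR in the lookup from the next pre-XORed chunk. The
  // first eight steps use wrapping subtraction (_mm_sub_epi8); the remaining
  // steps use saturating subtraction (_mm_subs_epi8) so indices that are
  // already negative stay negative instead of wrapping back into range.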
  const __m128i voffset = _mm_set1_epi8(16);
  for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
    __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
    __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
    x += 32;

    __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
    __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);

    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
    vx0 = _mm_sub_epi8(vx0, voffset);
    vx1 = _mm_sub_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));

    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
    vx0 = _mm_subs_epi8(vx0, voffset);
    vx1 = _mm_subs_epi8(vx1, voffset);
    vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
    vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));

    _mm_storeu_si128((__m128i*) y, vy0);
    _mm_storeu_si128((__m128i*) (y + 16), vy1);
    y += 32;
  }
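  // Handle a remaining full 16-byte vector, if any, with the same lookup
  // sequence.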
  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
    __m128i vx = _mm_loadu_si128((const __m128i*) x);
    x += 16;

    __m128i vy = _mm_shuffle_epi8(vtable0, vx);

    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));

    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));

    _mm_storeu_si128((__m128i*) y, vy);
    y += 16;
  }
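  // Handle the final 1-15 bytes. The load reads a full 16-byte vector; only
  // the first n result bytes are stored below.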
  if XNN_UNLIKELY(n != 0) {
    __m128i vx = _mm_loadu_si128((const __m128i*) x);

    __m128i vy = _mm_shuffle_epi8(vtable0, vx);

    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
    vx = _mm_sub_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));

    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
    vx = _mm_subs_epi8(vx, voffset);
    vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));

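    // Store the first n result bytes in 8-, 4-, 2-, and 1-byte pieces
    // according to the set bits of n.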
    if (n & (8 * sizeof(uint8_t))) {
      _mm_storel_epi64((__m128i*) y, vy);
      vy = _mm_unpackhi_epi64(vy, vy);
      y += 8;
    }
    if (n & (4 * sizeof(uint8_t))) {
      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
      vy = _mm_srli_epi64(vy, 32);
      y += 4;
    }
    uint32_t vy_lo = (uint32_t) _mm_cvtsi128_si32(vy);
    if (n & (2 * sizeof(uint8_t))) {
      *((uint16_t*) y) = (uint16_t) vy_lo;
      vy_lo >>= 16;
      y += 2;
    }
    if (n & (1 * sizeof(uint8_t))) {
      *y = (uint8_t) vy_lo;
    }
  }
}