// Auto-generated file. Do not edit!
// Template: src/x16-transpose/sse2.c.in
// Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

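// Transposes a block_height x block_width matrix of 16-bit elements, in
// 8x8 tiles, using the SSE2 unpack instructions. Strides are given in
// bytes and may exceed the block extent, so the kernel can operate on a
// sub-block of a larger matrix.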
void xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2(
    const uint16_t* input,
    uint16_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height)
{
  assert(output_stride >= block_height * sizeof(uint16_t));
  assert(input_stride >= block_width * sizeof(uint16_t));

  const size_t tile_height = 8;
  const size_t tile_width = 8;
  const size_t tile_hbytes = tile_height * sizeof(uint16_t);
  const size_t tile_wbytes = tile_width * sizeof(uint16_t);
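  // Per-strip pointer adjustments: input_reset rewinds i0 past the rows
  // consumed by the full tiles and moves it to the next 8 input columns;
  // output_reset moves o to the start of the next strip of 8 output rows,
  // undoing the offsets applied by the store sequences below.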
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes;

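  // i0 walks the input rows. o is biased back by one tile height: the store
  // sequence below adds oN_offset (= oN_stride + tile_hbytes) before the
  // first store of each tile, which cancels the bias.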
  const uint16_t* i0 = input;
  uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

  do {
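    // rem is the index of the last valid output row within a tile; stores
    // start at that row and walk back towards row 0.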
    const size_t rem = min(block_width - 1, 7);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
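    // Main loop: transpose full 8x8 tiles, reading eight rows of eight
    // 16-bit elements.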
    for (; bh >= 8; bh -= 8) {
      const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);

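      // Stage 1: interleave 16-bit elements of row pairs (0,1), (2,3),
      // (4,5) and (6,7).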
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);

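      // Stage 2: interleave 32-bit pairs of the stage-1 results.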
      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);

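      // Stage 3: interleave 64-bit halves; v0_k now holds input column k,
      // i.e. output row k of the transposed tile.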
      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);

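      // Store output rows 7..0, stepping back one output_stride per store.
      // When block_width < 8, the guarded pointer moves stop, so the
      // out-of-range rows are written on top of the last valid row and then
      // overwritten by valid data; XNN_UNPREDICTABLE marks these branches as
      // data-dependent.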
      o = (uint16_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_7);
      uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint16_t*) ((uintptr_t) o + tile_hbytes);
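    // Tail: transpose the remaining bh (< 8) rows. Row pointers i1..i6 are
    // clamped to the previous row, so rows beyond the block re-read the last
    // valid row instead of reading past it.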
    if (bh != 0) {
      const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i6);
      const __m128i v3_7 = _mm_undefined_si128();

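      // Same three-stage unpack network as the full-tile path; v3_7 is
      // undefined, but its lanes are never stored (at most bh < 8 elements
      // of each output row are written below).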
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);

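      // Store the remaining rows in shrinking chunks: four rows' worth of
      // data (8 bytes per output row), then two (4 bytes), then one
      // (2 bytes), shifting each register after a chunk so the next chunk
      // sits in its low lanes.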
      if (bh & 4) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_7);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 4;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
      }

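      // Two-row chunk: a 32-bit scalar store writes two 16-bit elements of
      // each output row.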
      if (bh & 2) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *((int*) o) = _mm_cvtsi128_si32(v0_7);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_0);
        o += 2;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
      }
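      // One-row chunk: write the lowest 16-bit element of each output row.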
      if (bh & 1) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_7);
        uint16_t* oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_0);
      }
    }

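    // Advance to the next 8 input columns / next 8 output rows. doz() is a
    // saturating subtraction, so block_width becomes 0 after the last strip.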
    i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
    o = (uint16_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
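
// Illustrative usage sketch (an assumption for documentation, not part of
// the generated kernel): transposing one full 8x8 row-major tile. The array
// names and dimensions are hypothetical; strides are in bytes.
//
//   uint16_t in[8 * 8];
//   uint16_t out[8 * 8];
//   xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2(
//       in, out,
//       /*input_stride=*/8 * sizeof(uint16_t),
//       /*output_stride=*/8 * sizeof(uint16_t),
//       /*block_width=*/8, /*block_height=*/8);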