// Auto-generated file. Do not edit!
//   Template: src/x32-transpose/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

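// Transposes a block_height x block_width matrix of 16-bit elements in
// 8x8 tiles: each tile is loaded row by row, transposed in registers with
// three rounds of unpack instructions, and stored one output row (i.e. one
// input column) at a time.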
void xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2(
    const uint16_t* input,
    uint16_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height)
{
  assert(output_stride >= block_height * sizeof(uint16_t));
  assert(input_stride >= block_width * sizeof(uint16_t));

  const size_t tile_height = 8;
  const size_t tile_width = 8;
  const size_t tile_hbytes = tile_height * sizeof(uint16_t);
  const size_t tile_wbytes = tile_width * sizeof(uint16_t);
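  // Offsets that rewind i0 and o to the start of the next 8-column tile
  // once a full column of tiles has been processed.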
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t) - tile_hbytes;

  const uint16_t* i0 = input;
  uint16_t* o = (uint16_t*) ((uintptr_t) output - tile_hbytes);
  const size_t minus_output_stride = -output_stride;

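  // Outer loop: one iteration per tile of up to 8 input columns.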
  do {
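    // rem is the index of the last valid output row in this tile (at most
    // 7); oN_stride/oN_offset position o at that row before the
    // backward-walking stores below.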
    const size_t rem = min(block_width - 1, 7);
    const size_t oN_stride = rem * output_stride;
    const size_t oN_offset = oN_stride + tile_hbytes;
    size_t bh = block_height;
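    // Main loop: transpose full 8x8 tiles while at least 8 rows remain.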
    for (; bh >= 8; bh -= 8) {
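      // Load eight rows of eight 16-bit elements each.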
      const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
      const __m128i v3_7 = _mm_loadu_si128((const __m128i*) i0);
      i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);

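      // Stage 1: interleave 16-bit elements of adjacent rows.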
      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);

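      // Stage 2: interleave 32-bit pairs of rows two apart.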
      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);

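      // Stage 3: interleave 64-bit halves; v0_N now holds transposed column N.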
      const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
      const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
      const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
      const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
      const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
      const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
      const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
      const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);

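      // Store columns 7..0 walking backward through the output rows. Each
      // pointer bump is predicated on block_width, so for a narrow tile the
      // out-of-range stores land on the same row and are overwritten by the
      // valid data that follows.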
      o = (uint16_t*) ((uintptr_t) o + oN_offset);
      _mm_storeu_si128((__m128i*) o, v0_7);
      uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_6);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 7) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_5);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_4);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 5) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_3);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_2);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width >= 3) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_1);
      oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
      if XNN_UNPREDICTABLE(block_width > 1) {
        o = oN;
      }
      _mm_storeu_si128((__m128i*) o, v0_0);
    }
    o = (uint16_t*) ((uintptr_t) o + tile_hbytes);
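    // Tail: transpose the remaining 1-7 rows. Row pointers past the last
    // valid row are clamped to the previous row, so the loads stay in
    // bounds and the duplicated lanes are simply never stored.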
    if (bh != 0) {
      const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
      const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i1);
      const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i2);
      const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i3);
      const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i4);
      const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i5);
      const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i6);
      const __m128i v3_7 = _mm_undefined_si128();

      const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
      const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
      const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
      const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
      const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
      const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
      const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
      const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);

      const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
      const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
      const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
      const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
      const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
      const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
      const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
      const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);

      __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
      __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
      __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
      __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
      __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
      __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
      __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
      __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);

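      // Store 4, 2, then 1 element(s) per output row according to the bits
      // of bh, shifting the next elements down within each register after
      // every pass.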
      if (bh & 4) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        _mm_storel_epi64((__m128i*) o, v0_7);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        _mm_storel_epi64((__m128i*) o, v0_0);
        o += 4;
        v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
        v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
        v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
        v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
        v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
        v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
        v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
        v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
      }

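      // Two rows left: store the low 32 bits (two elements) of each column
      // register.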
      if (bh & 2) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *((int*) o) = _mm_cvtsi128_si32(v0_7);
        uint16_t *oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((int*) o) = _mm_cvtsi128_si32(v0_0);
        o += 2;
        v0_0 = _mm_srli_epi64(v0_0, 32);
        v0_1 = _mm_srli_epi64(v0_1, 32);
        v0_2 = _mm_srli_epi64(v0_2, 32);
        v0_3 = _mm_srli_epi64(v0_3, 32);
        v0_4 = _mm_srli_epi64(v0_4, 32);
        v0_5 = _mm_srli_epi64(v0_5, 32);
        v0_6 = _mm_srli_epi64(v0_6, 32);
        v0_7 = _mm_srli_epi64(v0_7, 32);
      }
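      // One row left: store a single 16-bit element per output row.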
      if (bh & 1) {
        o = (uint16_t*) ((uintptr_t) o + oN_stride);
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_7);
        uint16_t* oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_6);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 7) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_5);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_4);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 5) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_3);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_2);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width >= 3) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_1);
        oN = (uint16_t*) ((uintptr_t) o + minus_output_stride);
        if XNN_UNPREDICTABLE(block_width > 1) {
          o = oN;
        }
        *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_0);
      }
    }

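    // Rewind both pointers and advance to the next tile of up to 8 input
    // columns.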
    i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
    o = (uint16_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}
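// Illustrative caller (a sketch, not part of the generated file). Both
// strides are in bytes. The kernel's 16-byte row loads may read past
// block_width, so this example pads each input row to a full 8 elements:
//
//   uint16_t in[5][8];   // 5 rows, 3 valid columns, padded to 8
//   uint16_t out[3][5];  // transposed: 3 rows of 5 elements
//   xnn_x16_transpose_ukernel__8x8_reuse_mov_sse2(
//       &in[0][0], &out[0][0],
//       /*input_stride=*/8 * sizeof(uint16_t),
//       /*output_stride=*/5 * sizeof(uint16_t),
//       /*block_width=*/3, /*block_height=*/5);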