// Auto-generated file. Do not edit!
//   Template: src/x32-transpose/sse2.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <immintrin.h>

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

xnn_x16_transpose_ukernel__8x8_reuse_switch_sse2(const uint16_t * input,uint16_t * output,size_t input_stride,size_t output_stride,size_t block_width,size_t block_height)18 void xnn_x16_transpose_ukernel__8x8_reuse_switch_sse2(
19     const uint16_t* input,
20     uint16_t* output,
21     size_t input_stride,
22     size_t output_stride,
23     size_t block_width,
24     size_t block_height)
25 {
26   assert(output_stride >= block_height * sizeof(uint16_t));
27   assert(input_stride >= block_width * sizeof(uint16_t));
28 
29   const size_t tile_height = 8;
30   const size_t tile_width = 8;
31   const size_t tile_hbytes = tile_height * sizeof(uint16_t);
32   const size_t tile_wbytes = tile_width * sizeof(uint16_t);
33   const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
34   const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint16_t);
35 
36   const uint16_t* i0 = input;
37   uint16_t* o = (uint16_t*) output;
38   const size_t minus_output_stride = -output_stride;
39 
40   do {
41     const size_t rem = min(block_width - 1, 7);
42     const size_t oN_stride = rem * output_stride;
43     size_t bh = block_height;
44     for (; bh >= 8; bh -= 8) {
45       const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
46       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
47       const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i0);
48       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
49       const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i0);
50       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
51       const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i0);
52       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
53       const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i0);
54       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
55       const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i0);
56       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
57       const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i0);
58       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
59       const __m128i v3_7 = _mm_loadu_si128((const __m128i*) i0);
60       i0 = (uint16_t*) ((uintptr_t) i0 + input_stride);
61 
62       const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
63       const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
64       const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
65       const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
66       const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
67       const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
68       const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
69       const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);
70 
71       const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
72       const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
73       const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
74       const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
75       const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
76       const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
77       const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
78       const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);
79 
80       const __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
81       const __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
82       const __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
83       const __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
84       const __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
85       const __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
86       const __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
87       const __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);
88 
89 
90       uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
91       switch (rem) {
92         case 7:
93           _mm_storeu_si128((__m128i*) oN, v0_7);
94           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
95         case 6:
96           _mm_storeu_si128((__m128i*) oN, v0_6);
97           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
98         case 5:
99           _mm_storeu_si128((__m128i*) oN, v0_5);
100           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
101         case 4:
102           _mm_storeu_si128((__m128i*) oN, v0_4);
103           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
104         case 3:
105           _mm_storeu_si128((__m128i*) oN, v0_3);
106           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
107         case 2:
108           _mm_storeu_si128((__m128i*) oN, v0_2);
109           oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
110         case 1:
111           _mm_storeu_si128((__m128i*) oN, v0_1);
112         case 0:
113           _mm_storeu_si128((__m128i*) o, v0_0);
114           o = (uint16_t*) ((uintptr_t) o + tile_hbytes);
115           break;
116         default:
117           XNN_UNREACHABLE;
118       }
119     }
120     if (bh != 0) {
121       const __m128i v3_0 = _mm_loadu_si128((const __m128i*) i0);
122       const uint16_t *i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
123       if XNN_UNPREDICTABLE(bh < 2) {
124         i1 = i0;
125       }
126       const __m128i v3_1 = _mm_loadu_si128((const __m128i*) i1);
127       const uint16_t *i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
128       if XNN_UNPREDICTABLE(bh <= 2) {
129         i2 = i1;
130       }
131       const __m128i v3_2 = _mm_loadu_si128((const __m128i*) i2);
132       const uint16_t *i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
133       if XNN_UNPREDICTABLE(bh < 4) {
134         i3 = i2;
135       }
136       const __m128i v3_3 = _mm_loadu_si128((const __m128i*) i3);
137       const uint16_t *i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
138       if XNN_UNPREDICTABLE(bh <= 4) {
139         i4 = i3;
140       }
141       const __m128i v3_4 = _mm_loadu_si128((const __m128i*) i4);
142       const uint16_t *i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
143       if XNN_UNPREDICTABLE(bh < 6) {
144         i5 = i4;
145       }
146       const __m128i v3_5 = _mm_loadu_si128((const __m128i*) i5);
147       const uint16_t *i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
148       if XNN_UNPREDICTABLE(bh <= 6) {
149         i6 = i5;
150       }
151       const __m128i v3_6 = _mm_loadu_si128((const __m128i*) i6);
152       const __m128i v3_7 = _mm_undefined_si128();
153 
154       const __m128i v2_0 = _mm_unpacklo_epi16(v3_0, v3_1);
155       const __m128i v2_1 = _mm_unpackhi_epi16(v3_0, v3_1);
156       const __m128i v2_2 = _mm_unpacklo_epi16(v3_2, v3_3);
157       const __m128i v2_3 = _mm_unpackhi_epi16(v3_2, v3_3);
158       const __m128i v2_4 = _mm_unpacklo_epi16(v3_4, v3_5);
159       const __m128i v2_5 = _mm_unpackhi_epi16(v3_4, v3_5);
160       const __m128i v2_6 = _mm_unpacklo_epi16(v3_6, v3_7);
161       const __m128i v2_7 = _mm_unpackhi_epi16(v3_6, v3_7);
162 
163       const __m128i v1_0 = _mm_unpacklo_epi32(v2_0, v2_2);
164       const __m128i v1_1 = _mm_unpackhi_epi32(v2_0, v2_2);
165       const __m128i v1_2 = _mm_unpacklo_epi32(v2_1, v2_3);
166       const __m128i v1_3 = _mm_unpackhi_epi32(v2_1, v2_3);
167       const __m128i v1_4 = _mm_unpacklo_epi32(v2_4, v2_6);
168       const __m128i v1_5 = _mm_unpackhi_epi32(v2_4, v2_6);
169       const __m128i v1_6 = _mm_unpacklo_epi32(v2_5, v2_7);
170       const __m128i v1_7 = _mm_unpackhi_epi32(v2_5, v2_7);
171 
172       __m128i v0_0 = _mm_unpacklo_epi64(v1_0, v1_4);
173       __m128i v0_1 = _mm_unpackhi_epi64(v1_0, v1_4);
174       __m128i v0_2 = _mm_unpacklo_epi64(v1_1, v1_5);
175       __m128i v0_3 = _mm_unpackhi_epi64(v1_1, v1_5);
176       __m128i v0_4 = _mm_unpacklo_epi64(v1_2, v1_6);
177       __m128i v0_5 = _mm_unpackhi_epi64(v1_2, v1_6);
178       __m128i v0_6 = _mm_unpacklo_epi64(v1_3, v1_7);
179       __m128i v0_7 = _mm_unpackhi_epi64(v1_3, v1_7);
180 
181 
182       if (bh & 4) {
183         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
184         switch (rem) {
185           case 7:
186             _mm_storel_epi64((__m128i*) oN, v0_7);
187             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
188           case 6:
189             _mm_storel_epi64((__m128i*) oN, v0_6);
190             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
191           case 5:
192             _mm_storel_epi64((__m128i*) oN, v0_5);
193             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
194           case 4:
195             _mm_storel_epi64((__m128i*) oN, v0_4);
196             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
197           case 3:
198             _mm_storel_epi64((__m128i*) oN, v0_3);
199             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
200           case 2:
201             _mm_storel_epi64((__m128i*) oN, v0_2);
202             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
203           case 1:
204             _mm_storel_epi64((__m128i*) oN, v0_1);
205           case 0:
206             _mm_storel_epi64((__m128i*) o, v0_0);
207             break;
208           default:
209             XNN_UNREACHABLE;
210         }
211         o += 4;
212         v0_0 = _mm_unpackhi_epi64(v0_0, v0_0);
213         v0_1 = _mm_unpackhi_epi64(v0_1, v0_1);
214         v0_2 = _mm_unpackhi_epi64(v0_2, v0_2);
215         v0_3 = _mm_unpackhi_epi64(v0_3, v0_3);
216         v0_4 = _mm_unpackhi_epi64(v0_4, v0_4);
217         v0_5 = _mm_unpackhi_epi64(v0_5, v0_5);
218         v0_6 = _mm_unpackhi_epi64(v0_6, v0_6);
219         v0_7 = _mm_unpackhi_epi64(v0_7, v0_7);
220       }
221 
222       if (bh & 2) {
223         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
224         switch (rem) {
225           case 7:
226             *((int*) oN) = _mm_cvtsi128_si32(v0_7);
227             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
228           case 6:
229             *((int*) oN) = _mm_cvtsi128_si32(v0_6);
230             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
231           case 5:
232             *((int*) oN) = _mm_cvtsi128_si32(v0_5);
233             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
234           case 4:
235             *((int*) oN) = _mm_cvtsi128_si32(v0_4);
236             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
237           case 3:
238             *((int*) oN) = _mm_cvtsi128_si32(v0_3);
239             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
240           case 2:
241             *((int*) oN) = _mm_cvtsi128_si32(v0_2);
242             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
243           case 1:
244             *((int*) oN) = _mm_cvtsi128_si32(v0_1);
245           case 0:
246             *((int*) o) = _mm_cvtsi128_si32(v0_0);
247             break;
248           default:
249             XNN_UNREACHABLE;
250         }
251         o += 2;
252         v0_0 = _mm_srli_epi64(v0_0, 32);
253         v0_1 = _mm_srli_epi64(v0_1, 32);
254         v0_2 = _mm_srli_epi64(v0_2, 32);
255         v0_3 = _mm_srli_epi64(v0_3, 32);
256         v0_4 = _mm_srli_epi64(v0_4, 32);
257         v0_5 = _mm_srli_epi64(v0_5, 32);
258         v0_6 = _mm_srli_epi64(v0_6, 32);
259         v0_7 = _mm_srli_epi64(v0_7, 32);
260       }
261       if (bh & 1) {
262         uint16_t* oN = (uint16_t*) ((uintptr_t) o + oN_stride);
263         switch (rem) {
264           case 7:
265             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_7);
266             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
267           case 6:
268             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_6);
269             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
270           case 5:
271             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_5);
272             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
273           case 4:
274             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_4);
275             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
276           case 3:
277             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_3);
278             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
279           case 2:
280             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_2);
281             oN = (uint16_t*) ((uintptr_t) oN + minus_output_stride);
282           case 1:
283             *((uint16_t*) oN) = (uint16_t) _mm_cvtsi128_si32(v0_1);
284           case 0:
285             *((uint16_t*) o) = (uint16_t) _mm_cvtsi128_si32(v0_0);
286             break;
287           default:
288             XNN_UNREACHABLE;
289         }
290       }
291     }
292 
293     i0 = (const uint16_t*) ((uintptr_t) i0 + input_reset);
294     o = (uint16_t*) ((uintptr_t) o + output_reset);
295     block_width = doz(block_width, tile_width);
296   } while (block_width != 0);
297 }
298