/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12 #ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
13 #define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
14
15 #include <emmintrin.h> // SSE2
16
17 #include "config/aom_config.h"
18
// Transposes a 4x4 block of 8-bit elements.
//
// in:  four vectors; the low 4 bytes of in[r] hold row r of the block.
// Returns one vector whose 16 bytes are the four columns of the block laid
// out one after another (column 0 first).
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to (low 8 bytes shown):
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);

  // Unpack 16 bit elements resulting in:
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
  return _mm_unpacklo_epi16(a0, a1);
}
35
// Transposes an 8x8 block of 8-bit elements.
//
// in:  eight vectors; the low 8 bytes of in[r] hold row r of the block.
// out: eight vectors; out[c] receives column c of the block in its low
//      8 bytes, and the same 8 bytes again in its high 8 bytes.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_8bit_8x8(const __m128i *const in,
                                      __m128i *const out) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // in[4]: 40 41 42 43 44 45 46 47
  // in[5]: 50 51 52 53 54 55 56 57
  // in[6]: 60 61 62 63 64 65 66 67
  // in[7]: 70 71 72 73 74 75 76 77
  // to:
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);

  // Unpack 16 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);

  // Unpack 32 bit elements resulting in:
  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);

  // Unpack 64 bit elements resulting in (low 8 bytes shown; unpacking a
  // vector with itself repeats the same 8 bytes in the high half):
  // out[0]: 00 10 20 30 40 50 60 70
  // out[1]: 01 11 21 31 41 51 61 71
  // out[2]: 02 12 22 32 42 52 62 72
  // out[3]: 03 13 23 33 43 53 63 73
  // out[4]: 04 14 24 34 44 54 64 74
  // out[5]: 05 15 25 35 45 55 65 75
  // out[6]: 06 16 26 36 46 56 66 76
  // out[7]: 07 17 27 37 47 57 67 77
  out[0] = _mm_unpacklo_epi64(c0, c0);
  out[1] = _mm_unpackhi_epi64(c0, c0);
  out[2] = _mm_unpacklo_epi64(c1, c1);
  out[3] = _mm_unpackhi_epi64(c1, c1);
  out[4] = _mm_unpacklo_epi64(c2, c2);
  out[5] = _mm_unpackhi_epi64(c2, c2);
  out[6] = _mm_unpacklo_epi64(c3, c3);
  out[7] = _mm_unpackhi_epi64(c3, c3);
}
95
// Transposes a 4x4 block of 16-bit elements.
//
// in:  four vectors; the low four 16-bit lanes of in[r] hold row r. The
//      upper four lanes are ignored.
// out: four vectors; the low four lanes of out[c] receive column c.
//      The upper four lanes of out[0] and out[2] hold columns 1 and 3
//      respectively; the upper four lanes of out[1] and out[3] are zero.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_16bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);

  // Unpack 32 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // Each unpack yields two columns; the odd column is moved down to the low
  // half with an 8-byte right shift, which shifts zeros into the high half.
  out[0] = _mm_unpacklo_epi32(a0, a1);
  out[1] = _mm_srli_si128(out[0], 8);
  out[2] = _mm_unpackhi_epi32(a0, a1);
  out[3] = _mm_srli_si128(out[2], 8);
}
119
// Transposes a block of 16-bit elements that is 4 wide and 8 tall.
//
// in:  eight vectors; the low four 16-bit lanes of in[r] hold row r. The
//      upper four lanes are ignored.
// out: four vectors; out[c] receives the eight elements of column c.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_16bit_4x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // in[4]: 40 41 42 43  XX XX XX XX
  // in[5]: 50 51 52 53  XX XX XX XX
  // in[6]: 60 61 62 63  XX XX XX XX
  // in[7]: 70 71 72 73  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 02 12 22 32  03 13 23 33
  // b3: 42 52 62 72  43 53 63 73
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b2, b3);
  out[3] = _mm_unpackhi_epi64(b2, b3);
}
161
// Transposes a block of 16-bit elements that is 8 wide and 4 tall.
//
// in:  four vectors; in[r] holds the eight elements of row r.
// out: eight vectors; the low four 16-bit lanes of out[c] receive column c
//      and the upper four lanes are set to zero.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_16bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b2: 04 14 24 34  05 15 25 35
  // b4: 02 12 22 32  03 13 23 33
  // b6: 06 16 26 36  07 17 27 37
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);

  // Unpack 64 bit elements against a zero vector, resulting in:
  // out[0]: 00 10 20 30  00 00 00 00
  // out[1]: 01 11 21 31  00 00 00 00
  // out[2]: 02 12 22 32  00 00 00 00
  // out[3]: 03 13 23 33  00 00 00 00
  // out[4]: 04 14 24 34  00 00 00 00
  // out[5]: 05 15 25 35  00 00 00 00
  // out[6]: 06 16 26 36  00 00 00 00
  // out[7]: 07 17 27 37  00 00 00 00
  // (the trailing 00s are zero-valued lanes, not element indices)
  const __m128i zeros = _mm_setzero_si128();
  out[0] = _mm_unpacklo_epi64(b0, zeros);
  out[1] = _mm_unpackhi_epi64(b0, zeros);
  out[2] = _mm_unpacklo_epi64(b4, zeros);
  out[3] = _mm_unpackhi_epi64(b4, zeros);
  out[4] = _mm_unpacklo_epi64(b2, zeros);
  out[5] = _mm_unpackhi_epi64(b2, zeros);
  out[6] = _mm_unpacklo_epi64(b6, zeros);
  out[7] = _mm_unpackhi_epi64(b6, zeros);
}
209
// Transposes an 8x8 block of 16-bit elements.
//
// in:  eight vectors; in[r] holds the eight elements of row r.
// out: eight vectors; out[c] receives the eight elements of column c.
//
// Every in[] element is read before the first out[] element is written, so
// passing the same array for in and out transposes in place.
static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // in[4]: 40 41 42 43  44 45 46 47
  // in[5]: 50 51 52 53  54 55 56 57
  // in[6]: 60 61 62 63  64 65 66 67
  // in[7]: 70 71 72 73  74 75 76 77
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  // a6:    44 54 45 55  46 56 47 57
  // a7:    64 74 65 75  66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 04 14 24 34  05 15 25 35
  // b3: 44 54 64 74  45 55 65 75
  // b4: 02 12 22 32  03 13 23 33
  // b5: 42 52 62 72  43 53 63 73
  // b6: 06 16 26 36  07 17 27 37
  // b7: 46 56 66 76  47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  // out[4]: 04 14 24 34  44 54 64 74
  // out[5]: 05 15 25 35  45 55 65 75
  // out[6]: 06 16 26 36  46 56 66 76
  // out[7]: 07 17 27 37  47 57 67 77
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b4, b5);
  out[3] = _mm_unpackhi_epi64(b4, b5);
  out[4] = _mm_unpacklo_epi64(b2, b3);
  out[5] = _mm_unpackhi_epi64(b2, b3);
  out[6] = _mm_unpacklo_epi64(b6, b7);
  out[7] = _mm_unpackhi_epi64(b6, b7);
}
275
// Transposes, in place, a 16x16 block of 16-bit elements stored as two
// arrays of 16 vectors each.
//
// left:  16 vectors, updated in place.
// right: 16 vectors, updated in place.
//
// The block is handled as four 8x8 quadrants: left[0..7], right[0..7],
// left[8..15] and right[8..15]. Each quadrant is transposed with
// transpose_16bit_8x8(); left[0..7] and right[8..15] stay where they are,
// while right[0..7] and left[8..15] trade places. This is a full 16x16
// transpose when left[r] and right[r] hold the first and last eight elements
// of row r.
static INLINE void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  // Holds the transposed right[0..7] quadrant until left[8..15] has been
  // consumed and can be overwritten.
  __m128i tbuf[8];

  // The in == out calls rely on transpose_16bit_8x8() reading all of its
  // inputs before it writes any output.
  transpose_16bit_8x8(left, left);
  transpose_16bit_8x8(right, tbuf);
  transpose_16bit_8x8(left + 8, right);
  transpose_16bit_8x8(right + 8, right + 8);

  for (int i = 0; i < 8; ++i) {
    left[8 + i] = tbuf[i];
  }
}
294
// Transposes a 4x4 block of 32-bit elements.
//
// in:  four vectors; in[r] holds the four elements of row r.
// out: four vectors; out[c] receives the four elements of column c.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
}
323
// Transposes two independent 4x4 blocks of 32-bit elements.
//
// in:  eight vectors; in[0..3] are the rows of the first block and
//      in[4..7] are the rows of the second block.
// out: eight vectors; out[0..3] receive the columns of the first block and
//      out[4..7] receive the columns of the second block.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // in[4]: 04 05 06 07
  // in[5]: 14 15 16 17
  // in[6]: 24 25 26 27
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
371
// Transposes a block of 32-bit elements that is 8 wide and 4 tall.
//
// in:  eight vectors; row r is split across in[2 * r] (first four
//      elements) and in[2 * r + 1] (last four elements).
// out: eight vectors; out[c] receives the four elements of column c.
//
// Every in[] element is read before the first out[] element is written.
static INLINE void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 04 05 06 07
  // in[2]: 10 11 12 13
  // in[3]: 14 15 16 17
  // in[4]: 20 21 22 23
  // in[5]: 24 25 26 27
  // in[6]: 30 31 32 33
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
419
420 #endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
421