/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11
12 #ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
13 #define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
14
15 #include <emmintrin.h> // SSE2
16
17 #include "config/aom_config.h"
18
// Transposes a 4x4 block of 8-bit elements.
//
// in:  array of 4 registers; only the low 4 bytes of each register (one row
//      of the block) contribute to the result.
// Returns the 16 transposed elements packed into a single register, one
// transposed row (i.e. one input column) after another.
static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);

  // Unpack 16 bit elements resulting in:
  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
  return _mm_unpacklo_epi16(a0, a1);
}
35
// Transposes an 8x8 block of 8-bit elements.
//
// in:  array of 8 registers; the low 8 bytes of in[r] hold row r.
// out: array of 8 registers; the low 8 bytes of out[c] receive column c of
//      the input. The high 8 bytes of each output duplicate its low 8 bytes.
//
// Every input register is read before any output register is written.
static INLINE void transpose_8bit_8x8(const __m128i *const in,
                                      __m128i *const out) {
  // Unpack 8 bit elements. Goes from:
  // in[0]: 00 01 02 03 04 05 06 07
  // in[1]: 10 11 12 13 14 15 16 17
  // in[2]: 20 21 22 23 24 25 26 27
  // in[3]: 30 31 32 33 34 35 36 37
  // in[4]: 40 41 42 43 44 45 46 47
  // in[5]: 50 51 52 53 54 55 56 57
  // in[6]: 60 61 62 63 64 65 66 67
  // in[7]: 70 71 72 73 74 75 76 77
  // to:
  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);

  // Unpack 16 bit elements resulting in:
  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  // b1: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  // b2: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);

  // Unpack 32 bit elements resulting in:
  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);

  // Unpack 64 bit elements resulting in (low 64 bits shown; the high 64 bits
  // repeat the low 64 bits):
  // out[0]: 00 10 20 30 40 50 60 70
  // out[1]: 01 11 21 31 41 51 61 71
  // out[2]: 02 12 22 32 42 52 62 72
  // out[3]: 03 13 23 33 43 53 63 73
  // out[4]: 04 14 24 34 44 54 64 74
  // out[5]: 05 15 25 35 45 55 65 75
  // out[6]: 06 16 26 36 46 56 66 76
  // out[7]: 07 17 27 37 47 57 67 77
  out[0] = _mm_unpacklo_epi64(c0, c0);
  out[1] = _mm_unpackhi_epi64(c0, c0);
  out[2] = _mm_unpacklo_epi64(c1, c1);
  out[3] = _mm_unpackhi_epi64(c1, c1);
  out[4] = _mm_unpacklo_epi64(c2, c2);
  out[5] = _mm_unpackhi_epi64(c2, c2);
  out[6] = _mm_unpacklo_epi64(c3, c3);
  out[7] = _mm_unpackhi_epi64(c3, c3);
}
95
// Transposes a 4x4 block of 16-bit elements.
//
// in:  array of 4 registers; only the low 64 bits (4 elements) of each are
//      used.
// out: array of 4 registers; the low 64 bits of out[c] receive column c of
//      the input.
//
// Both inputs pairs are combined into temporaries before any output register
// is written.
static INLINE void transpose_16bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);

  // Unpack 32 bit elements resulting in:
  // out[0]: 00 10 20 30  01 11 21 31
  // out[1]: 01 11 21 31  __ __ __ __
  // out[2]: 02 12 22 32  03 13 23 33
  // out[3]: 03 13 23 33  __ __ __ __
  //
  // Note: The high 64 bits of the output registers are shown for informational
  // purposes only. Callers should only use the low 64 bits of the output
  // registers. "__" indicates zeros.
  out[0] = _mm_unpacklo_epi32(a0, a1);
  out[1] = _mm_srli_si128(out[0], 8);  // Byte shift brings column 1 down.
  out[2] = _mm_unpackhi_epi32(a0, a1);
  out[3] = _mm_srli_si128(out[2], 8);  // Byte shift brings column 3 down.
}
123
// Transposes a block of 8 rows by 4 columns of 16-bit elements into 4 rows of
// 8 elements.
//
// in:  array of 8 registers; only the low 64 bits (4 elements) of each are
//      used.
// out: array of 4 registers; out[c] receives column c of the input (8
//      elements).
//
// Every input register is read before any output register is written.
static INLINE void transpose_16bit_4x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  XX XX XX XX
  // in[1]: 10 11 12 13  XX XX XX XX
  // in[2]: 20 21 22 23  XX XX XX XX
  // in[3]: 30 31 32 33  XX XX XX XX
  // in[4]: 40 41 42 43  XX XX XX XX
  // in[5]: 50 51 52 53  XX XX XX XX
  // in[6]: 60 61 62 63  XX XX XX XX
  // in[7]: 70 71 72 73  XX XX XX XX
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 02 12 22 32  03 13 23 33
  // b3: 42 52 62 72  43 53 63 73
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b2, b3);
  out[3] = _mm_unpackhi_epi64(b2, b3);
}
165
// Transposes a block of 4 rows by 8 columns of 16-bit elements into 8 rows of
// 4 elements.
//
// in:  array of 4 registers, each holding one full row of 8 elements.
// out: array of 8 registers; the low 64 bits of out[c] receive column c of
//      the input, and the high 64 bits are set to zero.
//
// Every input register is read before any output register is written.
static INLINE void transpose_16bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b2: 04 14 24 34  05 15 25 35
  // b4: 02 12 22 32  03 13 23 33
  // b6: 06 16 26 36  07 17 27 37
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);

  // Unpack 64 bit elements against a zero register resulting in
  // ("__" indicates zeros):
  // out[0]: 00 10 20 30  __ __ __ __
  // out[1]: 01 11 21 31  __ __ __ __
  // out[2]: 02 12 22 32  __ __ __ __
  // out[3]: 03 13 23 33  __ __ __ __
  // out[4]: 04 14 24 34  __ __ __ __
  // out[5]: 05 15 25 35  __ __ __ __
  // out[6]: 06 16 26 36  __ __ __ __
  // out[7]: 07 17 27 37  __ __ __ __
  const __m128i zeros = _mm_setzero_si128();
  out[0] = _mm_unpacklo_epi64(b0, zeros);
  out[1] = _mm_unpackhi_epi64(b0, zeros);
  out[2] = _mm_unpacklo_epi64(b4, zeros);
  out[3] = _mm_unpackhi_epi64(b4, zeros);
  out[4] = _mm_unpacklo_epi64(b2, zeros);
  out[5] = _mm_unpackhi_epi64(b2, zeros);
  out[6] = _mm_unpacklo_epi64(b6, zeros);
  out[7] = _mm_unpackhi_epi64(b6, zeros);
}
213
// Transposes an 8x8 block of 16-bit elements.
//
// in:  array of 8 registers, each holding one row of 8 elements.
// out: array of 8 registers; out[c] receives column c of the input.
//
// Every input register is read before any output register is written, so
// `out` may be the same array as `in`.
static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 16 bit elements. Goes from:
  // in[0]: 00 01 02 03  04 05 06 07
  // in[1]: 10 11 12 13  14 15 16 17
  // in[2]: 20 21 22 23  24 25 26 27
  // in[3]: 30 31 32 33  34 35 36 37
  // in[4]: 40 41 42 43  44 45 46 47
  // in[5]: 50 51 52 53  54 55 56 57
  // in[6]: 60 61 62 63  64 65 66 67
  // in[7]: 70 71 72 73  74 75 76 77
  // to:
  // a0:    00 10 01 11  02 12 03 13
  // a1:    20 30 21 31  22 32 23 33
  // a2:    40 50 41 51  42 52 43 53
  // a3:    60 70 61 71  62 72 63 73
  // a4:    04 14 05 15  06 16 07 17
  // a5:    24 34 25 35  26 36 27 37
  // a6:    44 54 45 55  46 56 47 57
  // a7:    64 74 65 75  66 76 67 77
  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);

  // Unpack 32 bit elements resulting in:
  // b0: 00 10 20 30  01 11 21 31
  // b1: 40 50 60 70  41 51 61 71
  // b2: 04 14 24 34  05 15 25 35
  // b3: 44 54 64 74  45 55 65 75
  // b4: 02 12 22 32  03 13 23 33
  // b5: 42 52 62 72  43 53 63 73
  // b6: 06 16 26 36  07 17 27 37
  // b7: 46 56 66 76  47 57 67 77
  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30  40 50 60 70
  // out[1]: 01 11 21 31  41 51 61 71
  // out[2]: 02 12 22 32  42 52 62 72
  // out[3]: 03 13 23 33  43 53 63 73
  // out[4]: 04 14 24 34  44 54 64 74
  // out[5]: 05 15 25 35  45 55 65 75
  // out[6]: 06 16 26 36  46 56 66 76
  // out[7]: 07 17 27 37  47 57 67 77
  out[0] = _mm_unpacklo_epi64(b0, b1);
  out[1] = _mm_unpackhi_epi64(b0, b1);
  out[2] = _mm_unpacklo_epi64(b4, b5);
  out[3] = _mm_unpackhi_epi64(b4, b5);
  out[4] = _mm_unpacklo_epi64(b2, b3);
  out[5] = _mm_unpackhi_epi64(b2, b3);
  out[6] = _mm_unpacklo_epi64(b6, b7);
  out[7] = _mm_unpackhi_epi64(b6, b7);
}
279
// Transposes a 16x16 block of 16-bit elements in place.
//
// The block is passed as two arrays of 16 registers: left[0..15] and
// right[0..15]. It is processed as four 8x8 quadrants:
//   left[0..7]   is transposed in place,
//   right[8..15] is transposed in place,
//   right[0..7]  is transposed and ends up in left[8..15],
//   left[8..15]  is transposed and ends up in right[0..7].
static INLINE void transpose_16bit_16x16(__m128i *const left,
                                         __m128i *const right) {
  // Holds the transposed right[0..7] quadrant until left[8..15] has been
  // consumed and can be overwritten.
  __m128i tbuf[8];
  transpose_16bit_8x8(left, left);
  transpose_16bit_8x8(right, tbuf);
  transpose_16bit_8x8(left + 8, right);
  transpose_16bit_8x8(right + 8, right + 8);

  for (int i = 0; i < 8; ++i) {
    left[8 + i] = tbuf[i];
  }
}
298
// Transposes a 4x4 block of 32-bit elements.
//
// in:  array of 4 registers, each holding one row of 4 elements.
// out: array of 4 registers; out[c] receives column c of the input.
//
// Every input register is read before any output register is written.
static INLINE void transpose_32bit_4x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
}
327
// Transposes two independent 4x4 blocks of 32-bit elements.
//
// in:  array of 8 registers; in[0..3] form the first 4x4 block and in[4..7]
//      the second. In the diagrams below the second block is labelled as
//      columns 4..7 of rows 0..3.
// out: array of 8 registers; out[0..3] receive the transposed first block and
//      out[4..7] the transposed second block.
//
// Every input register is read before any output register is written.
static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
                                         __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 10 11 12 13
  // in[2]: 20 21 22 23
  // in[3]: 30 31 32 33
  // in[4]: 04 05 06 07
  // in[5]: 14 15 16 17
  // in[6]: 24 25 26 27
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
375
// Transposes a block of 4 rows by 8 columns of 32-bit elements into 8 rows of
// 4 elements.
//
// in:  array of 8 registers; row r of the block is split across in[2 * r]
//      (columns 0..3) and in[2 * r + 1] (columns 4..7).
// out: array of 8 registers; out[c] receives column c of the input.
//
// Every input register is read before any output register is written.
static INLINE void transpose_32bit_8x4(const __m128i *const in,
                                       __m128i *const out) {
  // Unpack 32 bit elements. Goes from:
  // in[0]: 00 01 02 03
  // in[1]: 04 05 06 07
  // in[2]: 10 11 12 13
  // in[3]: 14 15 16 17
  // in[4]: 20 21 22 23
  // in[5]: 24 25 26 27
  // in[6]: 30 31 32 33
  // in[7]: 34 35 36 37
  // to:
  // a0:    00 10 01 11
  // a1:    20 30 21 31
  // a2:    02 12 03 13
  // a3:    22 32 23 33
  // a4:    04 14 05 15
  // a5:    24 34 25 35
  // a6:    06 16 07 17
  // a7:    26 36 27 37
  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);

  // Unpack 64 bit elements resulting in:
  // out[0]: 00 10 20 30
  // out[1]: 01 11 21 31
  // out[2]: 02 12 22 32
  // out[3]: 03 13 23 33
  // out[4]: 04 14 24 34
  // out[5]: 05 15 25 35
  // out[6]: 06 16 26 36
  // out[7]: 07 17 27 37
  out[0] = _mm_unpacklo_epi64(a0, a1);
  out[1] = _mm_unpackhi_epi64(a0, a1);
  out[2] = _mm_unpacklo_epi64(a2, a3);
  out[3] = _mm_unpackhi_epi64(a2, a3);
  out[4] = _mm_unpacklo_epi64(a4, a5);
  out[5] = _mm_unpackhi_epi64(a4, a5);
  out[6] = _mm_unpacklo_epi64(a6, a7);
  out[7] = _mm_unpackhi_epi64(a6, a7);
}
423
424 #endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
425