1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
13 #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
14
15 #include <emmintrin.h> // SSE2
16
17 #include "config/aom_config.h"
18
// Transposes a 6x6 block of 16-bit values.
//
// x0..x5 are the six input rows; lanes 0-5 of each row form the block.
// d0..d5 receive the six columns: lanes 0-5 of d<k> hold column k (one value
// per input row). Lanes 6-7 of every output contain leftover data or zeros
// and must be ignored by the caller.
static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                            __m128i *x2, __m128i *x3,
                                            __m128i *x4, __m128i *x5,
                                            __m128i *d0, __m128i *d1,
                                            __m128i *d2, __m128i *d3,
                                            __m128i *d4, __m128i *d5) {
  __m128i w0, w1, w2, w3, w4, w5, ww0;

  // Input layout, one row per register:
  // 00 01 02 03 04 05 xx xx
  // 10 11 12 13 14 15 xx xx
  // 20 21 22 23 24 25 xx xx
  // 30 31 32 33 34 35 xx xx
  // 40 41 42 43 44 45 xx xx
  // 50 51 52 53 54 55 xx xx

  // Interleave the low four lanes of each pair of adjacent rows.
  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53

  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
  // _mm_unpackhi_epi64 takes the HIGH 64 bits of its second operand, so w2
  // must be shifted LEFT by two lanes to bring "41 51" to the start of that
  // half. A right shift would put "43 53" there instead.
  *d1 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx

  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  // Here the LOW half of the second operand is used, so a right shift by
  // four lanes brings "42 52" to the front.
  *d2 = _mm_unpacklo_epi64(ww0,
                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx

  // Interleave the high four lanes of each pair of adjacent rows.
  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx

  // High half of (w2 >> 2 lanes) starts with "43 53".
  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53

  ww0 = _mm_unpacklo_epi32(w3, w4);   // 04 14 24 34 05 15 25 35
  *d4 = _mm_unpacklo_epi64(ww0, w5);  // 04 14 24 34 44 54 45 55
  // High half of (w5 << 2 lanes) starts with "45 55".
  *d5 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
}
58
// Transposes columns 0-3 of a 4-row by 8-column block of 16-bit values.
//
// x0..x3 are the four input rows. d0..d3 receive columns 0..3 respectively:
// lanes 0-3 hold the column (one value per input row), lanes 4-7 are zero.
static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                                    __m128i *x2, __m128i *x3,
                                                    __m128i *d0, __m128i *d1,
                                                    __m128i *d2, __m128i *d3) {
  const __m128i zeros = _mm_setzero_si128();

  // Interleave the low four lanes of rows (0,1) and of rows (2,3):
  //   rows01 = 00 10 01 11 02 12 03 13
  //   rows23 = 20 30 21 31 22 32 23 33
  const __m128i rows01 = _mm_unpacklo_epi16(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi16(*x2, *x3);

  // Merge the row pairs so each 64-bit half is one complete column:
  //   cols01 = 00 10 20 30 01 11 21 31
  //   cols23 = 02 12 22 32 03 13 23 33
  const __m128i cols01 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols23 = _mm_unpackhi_epi32(rows01, rows23);

  // Split each half into its own register, padding the top with zeros.
  *d0 = _mm_unpacklo_epi64(cols01, zeros);  // 00 10 20 30 00 00 00 00
  *d1 = _mm_unpackhi_epi64(cols01, zeros);  // 01 11 21 31 00 00 00 00
  *d2 = _mm_unpacklo_epi64(cols23, zeros);  // 02 12 22 32 00 00 00 00
  *d3 = _mm_unpackhi_epi64(cols23, zeros);  // 03 13 23 33 00 00 00 00
}
77
// Transposes columns 4-7 of a 4-row by 8-column block of 16-bit values.
//
// x0..x3 are the four input rows. d4..d7 receive columns 4..7 respectively:
// lanes 0-3 hold the column (one value per input row), lanes 4-7 are zero.
static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
                                                     __m128i *x2, __m128i *x3,
                                                     __m128i *d4, __m128i *d5,
                                                     __m128i *d6, __m128i *d7) {
  const __m128i zeros = _mm_setzero_si128();

  // Interleave the high four lanes of rows (0,1) and of rows (2,3):
  //   rows01 = 04 14 05 15 06 16 07 17
  //   rows23 = 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpackhi_epi16(*x0, *x1);
  const __m128i rows23 = _mm_unpackhi_epi16(*x2, *x3);

  // Merge the row pairs so each 64-bit half is one complete column:
  //   cols45 = 04 14 24 34 05 15 25 35
  //   cols67 = 06 16 26 36 07 17 27 37
  const __m128i cols45 = _mm_unpacklo_epi32(rows01, rows23);
  const __m128i cols67 = _mm_unpackhi_epi32(rows01, rows23);

  // Split each half into its own register, padding the top with zeros.
  *d4 = _mm_unpacklo_epi64(cols45, zeros);  // 04 14 24 34 00 00 00 00
  *d5 = _mm_unpackhi_epi64(cols45, zeros);  // 05 15 25 35 00 00 00 00
  *d6 = _mm_unpacklo_epi64(cols67, zeros);  // 06 16 26 36 00 00 00 00
  *d7 = _mm_unpackhi_epi64(cols67, zeros);  // 07 17 27 37 00 00 00 00
}
96
// Transposes a 4-row by 8-column block of 16-bit values into eight columns.
//
// The input (x) and output (d) pointers must refer to different registers:
// the inputs are not cached here and are read again after d0..d3 have
// already been written.
//
// Input:
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
// Output (lanes 4-7 of every output are zero):
//   d0  00 10 20 30
//   d1  01 11 21 31
//   d2  02 12 22 32
//   d3  03 13 23 33
//   d4  04 14 24 34
//   d5  05 15 25 35
//   d6  06 16 26 36
//   d7  07 17 27 37
static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3,
                                                __m128i *d4, __m128i *d5,
                                                __m128i *d6, __m128i *d7) {
  // Columns 0-3 first, then columns 4-7.
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
}
122
// Produces columns 0-3 of the transpose of an 8x8 block of 16-bit values.
//
// x0..x7 are the eight input rows. d0..d3 receive columns 0..3, each holding
// eight values (one per input row, in row order).
static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *x4, __m128i *x5,
                                                __m128i *x6, __m128i *x7,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3) {
  // Input layout:
  //   x0  00 01 02 03 04 05 06 07
  //   x1  10 11 12 13 14 15 16 17
  //   ...
  //   x7  70 71 72 73 74 75 76 77

  // Step 1: interleave the low four lanes of adjacent rows.
  const __m128i r01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i r23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  const __m128i r45 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  const __m128i r67 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73

  // Step 2: combine row pairs into half-columns (rows 0-3 and rows 4-7).
  const __m128i top01 = _mm_unpacklo_epi32(r01, r23);  // 00 10 20 30 01 11 21 31
  const __m128i bot01 = _mm_unpacklo_epi32(r45, r67);  // 40 50 60 70 41 51 61 71
  const __m128i top23 = _mm_unpackhi_epi32(r01, r23);  // 02 12 22 32 03 13 23 33
  const __m128i bot23 = _mm_unpackhi_epi32(r45, r67);  // 42 52 62 72 43 53 63 73

  // Step 3: join the top and bottom halves of each column.
  *d0 = _mm_unpacklo_epi64(top01, bot01);  // 00 10 20 30 40 50 60 70
  *d1 = _mm_unpackhi_epi64(top01, bot01);  // 01 11 21 31 41 51 61 71
  *d2 = _mm_unpacklo_epi64(top23, bot23);  // 02 12 22 32 42 52 62 72
  *d3 = _mm_unpackhi_epi64(top23, bot23);  // 03 13 23 33 43 53 63 73
}
156
// Produces columns 4-7 of the transpose of an 8x8 block of 16-bit values.
//
// x0..x7 are the eight input rows. d4..d7 receive columns 4..7, each holding
// eight values (one per input row, in row order).
static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
                                                 __m128i *x2, __m128i *x3,
                                                 __m128i *x4, __m128i *x5,
                                                 __m128i *x6, __m128i *x7,
                                                 __m128i *d4, __m128i *d5,
                                                 __m128i *d6, __m128i *d7) {
  // Input layout:
  //   x0  00 01 02 03 04 05 06 07
  //   x1  10 11 12 13 14 15 16 17
  //   ...
  //   x7  70 71 72 73 74 75 76 77

  // Step 1: interleave the high four lanes of adjacent rows.
  const __m128i r01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  const __m128i r23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  const __m128i r45 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  const __m128i r67 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77

  // Step 2: combine row pairs into half-columns (rows 0-3 and rows 4-7).
  const __m128i top45 = _mm_unpacklo_epi32(r01, r23);  // 04 14 24 34 05 15 25 35
  const __m128i bot45 = _mm_unpacklo_epi32(r45, r67);  // 44 54 64 74 45 55 65 75
  const __m128i top67 = _mm_unpackhi_epi32(r01, r23);  // 06 16 26 36 07 17 27 37
  const __m128i bot67 = _mm_unpackhi_epi32(r45, r67);  // 46 56 66 76 47 57 67 77

  // Step 3: join the top and bottom halves of each column.
  *d4 = _mm_unpacklo_epi64(top45, bot45);  // 04 14 24 34 44 54 64 74
  *d5 = _mm_unpackhi_epi64(top45, bot45);  // 05 15 25 35 45 55 65 75
  *d6 = _mm_unpacklo_epi64(top67, bot67);  // 06 16 26 36 46 56 66 76
  *d7 = _mm_unpackhi_epi64(top67, bot67);  // 07 17 27 37 47 57 67 77
}
189
// Transposes an 8x8 block of 16-bit values: x0..x7 are the input rows and
// d0..d7 receive the columns.
//
// The input (x) and output (d) pointers must refer to different registers:
// the inputs are not cached here and are read again after d0..d3 have
// already been written.
static INLINE void highbd_transpose8x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // Columns 0-3.
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7,
                               d0, d1, d2, d3);
  // Columns 4-7.
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7,
                                d4, d5, d6, d7);
}
200
// Runs the 8x8 16-bit transpose twice: once on element [0] and once on
// element [1] of every pointer. Each of x0..x7 and d0..d7 must therefore
// point to an array of at least two __m128i values.
//
// The input (x) and output (d) arrays must be different: the inputs are not
// cached and are read again after some outputs have already been written.
static INLINE void highbd_transpose8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  for (int half = 0; half < 2; ++half) {
    highbd_transpose8x8_sse2(x0 + half, x1 + half, x2 + half, x3 + half,
                             x4 + half, x5 + half, x6 + half, x7 + half,
                             d0 + half, d1 + half, d2 + half, d3 + half,
                             d4 + half, d5 + half, d6 + half, d7 + half);
  }
}
214
215 // Low bit depth functions
// Transposes columns 0-3 of a 4-row by 8-column block of 8-bit values.
//
// Input (only the low 8 bytes of each row are used):
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
// Output (bytes 0-3 hold the column; the remaining bytes are the following
// columns shifted down, then zeros, and should be treated as don't-care):
//   d0  00 10 20 30
//   d1  01 11 21 31
//   d2  02 12 22 32
//   d3  03 13 23 33
static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *d0, __m128i *d1,
                                             __m128i *d2, __m128i *d3) {
  // Byte-interleave adjacent rows:
  //   rows01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  //   rows23 = 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // All four columns packed back to back:
  //   00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  *d0 = _mm_unpacklo_epi16(rows01, rows23);

  // Each further column is the packed register shifted right by 4 more bytes.
  *d1 = _mm_srli_si128(*d0, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(*d0, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(*d0, 12);  // 03 13 23 33 ...
}
248
// Transposes a 4-row by 8-column block of 8-bit values into eight columns.
//
// Input (only the low 8 bytes of each row are used):
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
// Output (bytes 0-3 hold the column; the remaining bytes are the following
// columns shifted down, then zeros, and should be treated as don't-care):
//   d0  00 10 20 30      d4  04 14 24 34
//   d1  01 11 21 31      d5  05 15 25 35
//   d2  02 12 22 32      d6  06 16 26 36
//   d3  03 13 23 33      d7  07 17 27 37
static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *d0, __m128i *d1,
                                         __m128i *d2, __m128i *d3, __m128i *d4,
                                         __m128i *d5, __m128i *d6,
                                         __m128i *d7) {
  // Byte-interleave adjacent rows:
  //   rows01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  //   rows23 = 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // Four columns packed back to back in each register:
  //   cols0to3 = 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   cols4to7 = 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  const __m128i cols0to3 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i cols4to7 = _mm_unpackhi_epi16(rows01, rows23);

  // Peel off one column per output by shifting right 4 bytes at a time.
  *d0 = cols0to3;
  *d1 = _mm_srli_si128(cols0to3, 4);
  *d2 = _mm_srli_si128(cols0to3, 8);
  *d3 = _mm_srli_si128(cols0to3, 12);

  *d4 = cols4to7;
  *d5 = _mm_srli_si128(cols4to7, 4);
  *d6 = _mm_srli_si128(cols4to7, 8);
  *d7 = _mm_srli_si128(cols4to7, 12);
}
297
// Produces columns 0-3 of the transpose of an 8x8 block of 8-bit values.
//
// Input (only the low 8 bytes of each row are used):
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   ...
//   x7  70 71 72 73 74 75 76 77
// Output (column in the low 8 bytes; the high 8 bytes of d0/d2 hold the next
// column and those of d1/d3 are zero):
//   d0  00 10 20 30 40 50 60 70
//   d1  01 11 21 31 41 51 61 71
//   d2  02 12 22 32 42 52 62 72
//   d3  03 13 23 33 43 53 63 73
static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *x4, __m128i *x5,
                                         __m128i *x6, __m128i *x7, __m128i *d0,
                                         __m128i *d1, __m128i *d2,
                                         __m128i *d3) {
  // Byte-interleave adjacent rows, e.g.
  //   r01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  const __m128i r01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i r23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i r45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i r67 = _mm_unpacklo_epi8(*x6, *x7);

  // Columns 0-3 for rows 0-3 and for rows 4-7:
  //   top = 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   bot = 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i top = _mm_unpacklo_epi16(r01, r23);
  const __m128i bot = _mm_unpacklo_epi16(r45, r67);

  // Two full columns per register; the odd column is then shifted down into
  // the low 8 bytes of its own output.
  *d0 = _mm_unpacklo_epi32(top, bot);  // column 0 | column 1
  *d1 = _mm_srli_si128(*d0, 8);        // column 1 | zeros
  *d2 = _mm_unpackhi_epi32(top, bot);  // column 2 | column 3
  *d3 = _mm_srli_si128(*d2, 8);        // column 3 | zeros
}
344
// Transposes an 8x8 block of 8-bit values, packing two columns per output.
//
// x0..x7 are the input rows (only the low 8 bytes of each are used). Each
// output holds an even column in its low 8 bytes and the following odd
// column in its high 8 bytes:
//   d0d1  00 10 20 30 40 50 60 70 | 01 11 21 31 41 51 61 71
//   d2d3  02 12 22 32 42 52 62 72 | 03 13 23 33 43 53 63 73
//   d4d5  04 14 24 34 44 54 64 74 | 05 15 25 35 45 55 65 75
//   d6d7  06 16 26 36 46 56 66 76 | 07 17 27 37 47 57 67 77
static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                     __m128i *x3, __m128i *x4, __m128i *x5,
                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
                                     __m128i *d2d3, __m128i *d4d5,
                                     __m128i *d6d7) {
  // Byte-interleave adjacent rows, e.g.
  //   r01 = 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  const __m128i r01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i r23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i r45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i r67 = _mm_unpacklo_epi8(*x6, *x7);

  // Columns 0-3, split into rows 0-3 and rows 4-7:
  //   top_lo = 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  //   bot_lo = 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i top_lo = _mm_unpacklo_epi16(r01, r23);
  const __m128i bot_lo = _mm_unpacklo_epi16(r45, r67);

  *d0d1 = _mm_unpacklo_epi32(top_lo, bot_lo);
  *d2d3 = _mm_unpackhi_epi32(top_lo, bot_lo);

  // Columns 4-7, split into rows 0-3 and rows 4-7:
  //   top_hi = 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  //   bot_hi = 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  const __m128i top_hi = _mm_unpackhi_epi16(r01, r23);
  const __m128i bot_hi = _mm_unpackhi_epi16(r45, r67);

  *d4d5 = _mm_unpacklo_epi32(top_hi, bot_hi);
  *d6d7 = _mm_unpackhi_epi32(top_hi, bot_hi);
}
391
// Transposes a 16-row by 8-column block of 8-bit values.
//
// x0..x15 are the sixteen input rows; only the low 8 bytes of each are used.
// d0..d7 receive columns 0..7: byte j of d<k> is byte k of row x<j>.
static INLINE void transpose16x8_8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  // Stage 1: byte-interleave adjacent rows. "top" covers rows 0-7 and "bot"
  // covers rows 8-15.
  const __m128i top01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i top23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i top45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i top67 = _mm_unpacklo_epi8(*x6, *x7);

  const __m128i bot01 = _mm_unpacklo_epi8(*x8, *x9);
  const __m128i bot23 = _mm_unpacklo_epi8(*x10, *x11);
  const __m128i bot45 = _mm_unpacklo_epi8(*x12, *x13);
  const __m128i bot67 = _mm_unpacklo_epi8(*x14, *x15);

  // Stage 2 (columns 0-3): merge groups of four rows, then groups of eight.
  __m128i top_a = _mm_unpacklo_epi16(top01, top23);  // rows 0-3
  __m128i top_b = _mm_unpacklo_epi16(top45, top67);  // rows 4-7
  __m128i bot_a = _mm_unpacklo_epi16(bot01, bot23);  // rows 8-11
  __m128i bot_b = _mm_unpacklo_epi16(bot45, bot67);  // rows 12-15

  __m128i top_lo = _mm_unpacklo_epi32(top_a, top_b);  // rows 0-7
  __m128i top_hi = _mm_unpackhi_epi32(top_a, top_b);
  __m128i bot_lo = _mm_unpacklo_epi32(bot_a, bot_b);  // rows 8-15
  __m128i bot_hi = _mm_unpackhi_epi32(bot_a, bot_b);

  // Join the upper and lower eight rows of each column.
  *d0 = _mm_unpacklo_epi64(top_lo, bot_lo);
  *d1 = _mm_unpackhi_epi64(top_lo, bot_lo);
  *d2 = _mm_unpacklo_epi64(top_hi, bot_hi);
  *d3 = _mm_unpackhi_epi64(top_hi, bot_hi);

  // Stage 3 (columns 4-7): same merges on the high halves of stage 1.
  top_a = _mm_unpackhi_epi16(top01, top23);
  top_b = _mm_unpackhi_epi16(top45, top67);
  bot_a = _mm_unpackhi_epi16(bot01, bot23);
  bot_b = _mm_unpackhi_epi16(bot45, bot67);

  top_lo = _mm_unpacklo_epi32(top_a, top_b);
  top_hi = _mm_unpackhi_epi32(top_a, top_b);
  bot_lo = _mm_unpacklo_epi32(bot_a, bot_b);
  bot_hi = _mm_unpackhi_epi32(bot_a, bot_b);

  *d4 = _mm_unpacklo_epi64(top_lo, bot_lo);
  *d5 = _mm_unpackhi_epi64(top_lo, bot_lo);
  *d6 = _mm_unpacklo_epi64(top_hi, bot_hi);
  *d7 = _mm_unpackhi_epi64(top_hi, bot_hi);
}
443
// Transposes an 8-row by 16-column block of 8-bit values.
//
// x0..x7 are the eight input rows (all 16 bytes of each are used). The eight
// outputs, in parameter order, are numbered m = 0..7. Output m holds column m
// in its low 8 bytes and column m + 8 in its high 8 bytes, each column
// listing rows 0-7 in order.
static INLINE void transpose8x16_16x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
    __m128i *d12d13, __m128i *d14d15) {
  // Stage 1: byte-interleave adjacent rows. "left" covers input columns 0-7
  // and "right" covers input columns 8-15.
  const __m128i left01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i left23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i left45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i left67 = _mm_unpacklo_epi8(*x6, *x7);

  const __m128i right01 = _mm_unpackhi_epi8(*x0, *x1);
  const __m128i right23 = _mm_unpackhi_epi8(*x2, *x3);
  const __m128i right45 = _mm_unpackhi_epi8(*x4, *x5);
  const __m128i right67 = _mm_unpackhi_epi8(*x6, *x7);

  // Stage 2: first four columns of each side (0-3 and 8-11).
  __m128i left_a = _mm_unpacklo_epi16(left01, left23);     // rows 0-3
  __m128i left_b = _mm_unpacklo_epi16(left45, left67);     // rows 4-7
  __m128i right_a = _mm_unpacklo_epi16(right01, right23);  // rows 0-3
  __m128i right_b = _mm_unpacklo_epi16(right45, right67);  // rows 4-7

  __m128i left_lo = _mm_unpacklo_epi32(left_a, left_b);
  __m128i left_hi = _mm_unpackhi_epi32(left_a, left_b);
  __m128i right_lo = _mm_unpacklo_epi32(right_a, right_b);
  __m128i right_hi = _mm_unpackhi_epi32(right_a, right_b);

  *d0d1 = _mm_unpacklo_epi64(left_lo, right_lo);  // column 0 | column 8
  *d2d3 = _mm_unpackhi_epi64(left_lo, right_lo);  // column 1 | column 9
  *d4d5 = _mm_unpacklo_epi64(left_hi, right_hi);  // column 2 | column 10
  *d6d7 = _mm_unpackhi_epi64(left_hi, right_hi);  // column 3 | column 11

  // Stage 3: last four columns of each side (4-7 and 12-15).
  left_a = _mm_unpackhi_epi16(left01, left23);
  left_b = _mm_unpackhi_epi16(left45, left67);
  right_a = _mm_unpackhi_epi16(right01, right23);
  right_b = _mm_unpackhi_epi16(right45, right67);

  left_lo = _mm_unpacklo_epi32(left_a, left_b);
  left_hi = _mm_unpackhi_epi32(left_a, left_b);
  right_lo = _mm_unpacklo_epi32(right_a, right_b);
  right_hi = _mm_unpackhi_epi32(right_a, right_b);

  *d8d9 = _mm_unpacklo_epi64(left_lo, right_lo);    // column 4 | column 12
  *d10d11 = _mm_unpackhi_epi64(left_lo, right_lo);  // column 5 | column 13
  *d12d13 = _mm_unpacklo_epi64(left_hi, right_hi);  // column 6 | column 14
  *d14d15 = _mm_unpackhi_epi64(left_hi, right_hi);  // column 7 | column 15
}
494
495 #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
496