/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
13 #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
14 
#include <emmintrin.h>  // SSE2
#include <string.h>     // memcpy (used by mm_storelu / mm_storehu)

#include "config/aom_config.h"
18 
// Store one 8-byte half of a 128-bit vector to memory using memcpy.
//   mm_storelu(dst, v): copies bytes 0..7  of v to dst.
//   mm_storehu(dst, v): copies bytes 8..15 of v to dst.
// The address of v is taken, so v must be an lvalue (a named variable), not
// the result of an expression. dst needs no particular alignment.
#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
21 
// Transposes a 6x6 block of 16-bit values.
//
// Input:  x0..x5 each hold one row in 16-bit lanes 0..5 (lanes 6..7 ignored).
// Output: d0..d5 each hold one column in 16-bit lanes 0..5; lanes 6..7 of the
//         outputs contain leftover data and must be treated as don't-care.
//
// x0..x5 are read again after d0..d2 have been written, so the output
// pointers must not alias the input pointers.
static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                            __m128i *x2, __m128i *x3,
                                            __m128i *x4, __m128i *x5,
                                            __m128i *d0, __m128i *d1,
                                            __m128i *d2, __m128i *d3,
                                            __m128i *d4, __m128i *d5) {
  __m128i w0, w1, w2, w3, w4, w5, ww0;

  // 00 01 02 03 04 05 xx xx
  // 10 11 12 13 14 15 xx xx
  // 20 21 22 23 24 25 xx xx
  // 30 31 32 33 34 35 xx xx
  // 40 41 42 43 44 45 xx xx
  // 50 51 52 53 54 55 xx xx

  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53

  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
  // Shift w2 LEFT by 4 bytes so that "41 51" lands at the start of its high
  // 64-bit half (0 0 40 50 | 41 51 42 52). A right shift would put "43 53"
  // there instead and corrupt rows 4 and 5 of column 1.
  *d1 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx

  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  *d2 = _mm_unpacklo_epi64(ww0,
                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx

  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx

  // Here a right shift is correct: 41 51 42 52 | 43 53 0 0.
  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53

  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
  *d5 = _mm_unpackhi_epi64(ww0,
                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
}
61 
// Transposes columns 0..3 of a 4-row x 8-column block of 16-bit values.
//
// Input:  x0..x3 hold rows 0..3 (eight 16-bit lanes each); only lanes 0..3
//         are used.
// Output: d0..d3 hold columns 0..3 in lanes 0..3; lanes 4..7 are zero.
static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                                    __m128i *x2, __m128i *x3,
                                                    __m128i *d0, __m128i *d1,
                                                    __m128i *d2, __m128i *d3) {
  __m128i zero = _mm_setzero_si128();
  __m128i w0, w1, ww0, ww1;

  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33

  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33

  // Split each register into its two 64-bit halves, zero-filling the top.
  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 0 0 0 0
  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 0 0 0 0
  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 0 0 0 0
  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 0 0 0 0
}
80 
// Transposes columns 4..7 of a 4-row x 8-column block of 16-bit values.
//
// Input:  x0..x3 hold rows 0..3 (eight 16-bit lanes each); only lanes 4..7
//         are used.
// Output: d4..d7 hold columns 4..7 in lanes 0..3; lanes 4..7 are zero.
static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
                                                     __m128i *x2, __m128i *x3,
                                                     __m128i *d4, __m128i *d5,
                                                     __m128i *d6, __m128i *d7) {
  __m128i w0, w1, ww2, ww3;
  __m128i zero = _mm_setzero_si128();

  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37

  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37

  // Split each register into its two 64-bit halves, zero-filling the top.
  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 0 0 0 0
  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 0 0 0 0
  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 0 0 0 0
  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 0 0 0 0
}
99 
// Transposes a 4-row x 8-column block of 16-bit values into eight registers,
// one per column, by running the "low" (columns 0..3) and "high"
// (columns 4..7) helpers in turn.
//
// The input pointers (x) and output pointers (d) must be different: the
// inputs are not copied to locals here, and they are read a second time by
// the "high" helper after d0..d3 have already been written.
static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3,
                                                __m128i *d4, __m128i *d5,
                                                __m128i *d6, __m128i *d7) {
  // input
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  // output
  // 00 10 20 30 xx xx xx xx
  // 01 11 21 31 xx xx xx xx
  // 02 12 22 32 xx xx xx xx
  // 03 13 23 33 xx xx xx xx
  // 04 14 24 34 xx xx xx xx
  // 05 15 25 35 xx xx xx xx
  // 06 16 26 36 xx xx xx xx
  // 07 17 27 37 xx xx xx xx
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
}
125 
// Produces columns 0..3 of the transpose of an 8x8 block of 16-bit values.
//
// Input:  x0..x7 hold rows 0..7 (eight 16-bit lanes each); only lanes 0..3
//         of each row are used.
// Output: d0..d3 hold columns 0..3, with row r in lane r.
static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *x4, __m128i *x5,
                                                __m128i *x6, __m128i *x7,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3) {
  __m128i w0, w1, w2, w3, ww0, ww1;
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  // x4 40 41 42 43 44 45 46 47
  // x5 50 51 52 53 54 55 56 57
  // x6 60 61 62 63 64 65 66 67
  // x7 70 71 72 73 74 75 76 77

  // Interleave adjacent row pairs at 16-bit granularity.
  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73

  // Columns 0 and 1.
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  // Columns 2 and 3.
  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
}
159 
// Produces columns 4..7 of the transpose of an 8x8 block of 16-bit values.
//
// Input:  x0..x7 hold rows 0..7 (eight 16-bit lanes each); only lanes 4..7
//         of each row are used.
// Output: d4..d7 hold columns 4..7, with row r in lane r.
static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
                                                 __m128i *x2, __m128i *x3,
                                                 __m128i *x4, __m128i *x5,
                                                 __m128i *x6, __m128i *x7,
                                                 __m128i *d4, __m128i *d5,
                                                 __m128i *d6, __m128i *d7) {
  __m128i w0, w1, w2, w3, ww0, ww1;
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  // x4 40 41 42 43 44 45 46 47
  // x5 50 51 52 53 54 55 56 57
  // x6 60 61 62 63 64 65 66 67
  // x7 70 71 72 73 74 75 76 77

  // Interleave adjacent row pairs at 16-bit granularity (upper halves).
  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77

  // Columns 4 and 5.
  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75

  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75

  // Columns 6 and 7.
  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77

  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
}
192 
// Transposes an 8x8 block of 16-bit values: x0..x7 are the rows, d0..d7
// receive the columns. Implemented as the "low" helper (columns 0..3)
// followed by the "high" helper (columns 4..7).
//
// The input pointers (x) and output pointers (d) must be different: the
// inputs are not copied to locals here, and they are read a second time by
// the "high" helper after d0..d3 have already been written.
static INLINE void highbd_transpose8x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
}
203 
// Runs highbd_transpose8x8_sse2 twice: once on element [0] and once on
// element [1] of every argument. Each xN and dN must therefore point to at
// least two consecutive __m128i values.
//
// The input arrays (x) and output arrays (d) must be different: the inputs
// are not copied to locals, so writing an output that overlaps an input
// would corrupt data that is still to be read.
static INLINE void highbd_transpose8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // First 8x8 block: element [0] of each array.
  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
                           d5, d6, d7);
  // Second 8x8 block: element [1] of each array.
  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
}
217 
// Low bit depth functions
// Transposes columns 0..3 of a 4-row block of 8-bit values.
//
// Input:  x0..x3 hold rows 0..3 in their low 8 bytes.
// Output: the first 4 bytes of d0..d3 hold columns 0..3 (row r in byte r).
//         The remaining bytes of each output are don't-care.
static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *d0, __m128i *d1,
                                             __m128i *d2, __m128i *d3) {
  // input
  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  // output
  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx

  __m128i w0, w1;

  w0 = _mm_unpacklo_epi8(
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  w1 = _mm_unpacklo_epi8(
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37

  *d0 = _mm_unpacklo_epi16(
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33

  // Columns 1..3 are d0 shifted down by 4, 8 and 12 bytes.
  *d1 = _mm_srli_si128(*d0,
                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  *d2 = _mm_srli_si128(*d0,
                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  *d3 = _mm_srli_si128(*d0,
                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
}
251 
// Transposes a 4-row x 8-column block of 8-bit values.
//
// Input:  x0..x3 hold rows 0..3 in their low 8 bytes.
// Output: the first 4 bytes of d0..d7 hold columns 0..7 (row r in byte r).
//         The remaining bytes of each output are don't-care.
static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *d0, __m128i *d1,
                                         __m128i *d2, __m128i *d3, __m128i *d4,
                                         __m128i *d5, __m128i *d6,
                                         __m128i *d7) {
  // input
  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
  // output
  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx

  __m128i w0, w1, ww0, ww1;

  w0 = _mm_unpacklo_epi8(
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  w1 = _mm_unpacklo_epi8(
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37

  ww0 = _mm_unpacklo_epi16(
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  ww1 = _mm_unpackhi_epi16(
      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37

  // Columns 0..3: ww0 shifted down by 0, 4, 8 and 12 bytes.
  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
  *d1 = _mm_srli_si128(ww0,
                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
  *d2 = _mm_srli_si128(ww0,
                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
  *d3 = _mm_srli_si128(ww0,
                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx

  // Columns 4..7: ww1 shifted down by 0, 4, 8 and 12 bytes.
  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
  *d5 = _mm_srli_si128(ww1,
                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
  *d6 = _mm_srli_si128(ww1,
                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
  *d7 = _mm_srli_si128(ww1,
                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
}
300 
// Produces columns 0..3 of the transpose of an 8x8 block of 8-bit values.
//
// Input:  x0..x7 hold rows 0..7 in their low 8 bytes.
// Output: the low 8 bytes of d0..d3 hold columns 0..3 (row r in byte r).
//         The high 8 bytes of each output are don't-care.
static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *x4, __m128i *x5,
                                         __m128i *x6, __m128i *x7, __m128i *d0,
                                         __m128i *d1, __m128i *d2,
                                         __m128i *d3) {
  // input
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  // x4 40 41 42 43 44 45 46 47
  // x5 50 51 52 53 54 55 56 57
  // x6 60 61 62 63 64 65 66 67
  // x7 70 71 72 73 74 75 76 77
  // output
  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx

  __m128i w0, w1, w2, w3, w4, w5;

  w0 = _mm_unpacklo_epi8(
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17

  w1 = _mm_unpacklo_epi8(
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37

  w2 = _mm_unpacklo_epi8(
      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57

  w3 = _mm_unpacklo_epi8(
      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77

  w4 = _mm_unpacklo_epi16(
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  w5 = _mm_unpacklo_epi16(
      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73

  *d0 = _mm_unpacklo_epi32(
      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  // Column 1 is the high half of d0, moved down to the low 8 bytes.
  *d1 = _mm_srli_si128(*d0, 8);
  *d2 = _mm_unpackhi_epi32(
      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  // Column 3 is the high half of d2, moved down to the low 8 bytes.
  *d3 = _mm_srli_si128(*d2, 8);
}
347 
// Transposes an 8x8 block of 8-bit values, packing two columns per output.
//
// Input:  x0..x7 hold rows 0..7 in their low 8 bytes.
// Output: d0d1 = column 0 (low 8 bytes) | column 1 (high 8 bytes)
//         d2d3 = column 2 | column 3
//         d4d5 = column 4 | column 5
//         d6d7 = column 6 | column 7
static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                     __m128i *x3, __m128i *x4, __m128i *x5,
                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
                                     __m128i *d2d3, __m128i *d4d5,
                                     __m128i *d6d7) {
  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  w0 = _mm_unpacklo_epi8(
      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17

  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  w1 = _mm_unpacklo_epi8(
      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37

  // x4 40 41 42 43 44 45 46 47
  // x5 50 51 52 53 54 55 56 57
  w2 = _mm_unpacklo_epi8(
      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57

  // x6 60 61 62 63 64 65 66 67
  // x7 70 71 72 73 74 75 76 77
  w3 = _mm_unpacklo_epi8(
      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77

  w4 = _mm_unpacklo_epi16(
      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  w5 = _mm_unpacklo_epi16(
      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73

  *d0d1 = _mm_unpacklo_epi32(
      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  *d2d3 = _mm_unpackhi_epi32(
      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73

  w6 = _mm_unpackhi_epi16(
      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  w7 = _mm_unpackhi_epi16(
      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77

  *d4d5 = _mm_unpacklo_epi32(
      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  *d6d7 = _mm_unpackhi_epi32(
      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
}
394 
// Transposes a 16-row x 8-column block of 8-bit values into 8 rows of 16.
//
// Input:  x0..x15 hold rows 0..15 in their low 8 bytes (high 8 bytes unused).
// Output: d0..d7 hold columns 0..7; byte r of dN is input row r, column N.
static INLINE void transpose16x8_8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m128i w10, w11, w12, w13, w14, w15;

  // Byte-interleave adjacent row pairs of rows 0..7 ...
  w0 = _mm_unpacklo_epi8(*x0, *x1);
  w1 = _mm_unpacklo_epi8(*x2, *x3);
  w2 = _mm_unpacklo_epi8(*x4, *x5);
  w3 = _mm_unpacklo_epi8(*x6, *x7);

  // ... and of rows 8..15.
  w8 = _mm_unpacklo_epi8(*x8, *x9);
  w9 = _mm_unpacklo_epi8(*x10, *x11);
  w10 = _mm_unpacklo_epi8(*x12, *x13);
  w11 = _mm_unpacklo_epi8(*x14, *x15);

  // Columns 0..3, four rows per register.
  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  // w6 / w7: columns 0|1 and 2|3 of rows 0..7;
  // w14 / w15: the same columns of rows 8..15.
  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  *d0 = _mm_unpacklo_epi64(w6, w14);
  *d1 = _mm_unpackhi_epi64(w6, w14);
  *d2 = _mm_unpacklo_epi64(w7, w15);
  *d3 = _mm_unpackhi_epi64(w7, w15);

  // Columns 4..7, four rows per register.
  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  *d4 = _mm_unpacklo_epi64(w6, w14);
  *d5 = _mm_unpackhi_epi64(w6, w14);
  *d6 = _mm_unpacklo_epi64(w7, w15);
  *d7 = _mm_unpackhi_epi64(w7, w15);
}
446 
// Transposes an 8-row x 16-column block of 8-bit values, packing two
// 8-byte columns into each output register.
//
// Input:  x0..x7 hold rows 0..7, 16 bytes each.
// Output: numbering the outputs k = 0..7 in parameter order
//         (d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15), output k
//         holds input column k in its low 8 bytes and input column k + 8 in
//         its high 8 bytes (row r in byte r of each half).
// NOTE(review): the parameter names suggest adjacent column pairs
// (0|1, 2|3, ...), which is not what the shuffles below produce -- confirm
// against the callers before relying on the names.
static INLINE void transpose8x16_16x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
    __m128i *d12d13, __m128i *d14d15) {
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m128i w10, w11, w12, w13, w14, w15;

  // Byte-interleave adjacent row pairs: columns 0..7 ...
  w0 = _mm_unpacklo_epi8(*x0, *x1);
  w1 = _mm_unpacklo_epi8(*x2, *x3);
  w2 = _mm_unpacklo_epi8(*x4, *x5);
  w3 = _mm_unpacklo_epi8(*x6, *x7);

  // ... and columns 8..15.
  w8 = _mm_unpackhi_epi8(*x0, *x1);
  w9 = _mm_unpackhi_epi8(*x2, *x3);
  w10 = _mm_unpackhi_epi8(*x4, *x5);
  w11 = _mm_unpackhi_epi8(*x6, *x7);

  // Columns 0..3 (w4, w5) and 8..11 (w12, w13), four rows per register.
  w4 = _mm_unpacklo_epi16(w0, w1);
  w5 = _mm_unpacklo_epi16(w2, w3);
  w12 = _mm_unpacklo_epi16(w8, w9);
  w13 = _mm_unpacklo_epi16(w10, w11);

  // w6 = col 0|1, w7 = col 2|3, w14 = col 8|9, w15 = col 10|11.
  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  *d0d1 = _mm_unpacklo_epi64(w6, w14);  // col 0 | col 8
  *d2d3 = _mm_unpackhi_epi64(w6, w14);  // col 1 | col 9
  *d4d5 = _mm_unpacklo_epi64(w7, w15);  // col 2 | col 10
  *d6d7 = _mm_unpackhi_epi64(w7, w15);  // col 3 | col 11

  // Columns 4..7 (w4, w5) and 12..15 (w12, w13), four rows per register.
  w4 = _mm_unpackhi_epi16(w0, w1);
  w5 = _mm_unpackhi_epi16(w2, w3);
  w12 = _mm_unpackhi_epi16(w8, w9);
  w13 = _mm_unpackhi_epi16(w10, w11);

  // w6 = col 4|5, w7 = col 6|7, w14 = col 12|13, w15 = col 14|15.
  w6 = _mm_unpacklo_epi32(w4, w5);
  w7 = _mm_unpackhi_epi32(w4, w5);
  w14 = _mm_unpacklo_epi32(w12, w13);
  w15 = _mm_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  *d8d9 = _mm_unpacklo_epi64(w6, w14);    // col 4 | col 12
  *d10d11 = _mm_unpackhi_epi64(w6, w14);  // col 5 | col 13
  *d12d13 = _mm_unpacklo_epi64(w7, w15);  // col 6 | col 14
  *d14d15 = _mm_unpackhi_epi64(w7, w15);  // col 7 | col 15
}
497 
// Transposes a 16-row x 8-column block of bytes from memory into an
// 8-row x 16-column block in memory.
//
//   in0   : first of input rows 0..7  (8 bytes are read from each row)
//   in1   : first of input rows 8..15 (8 bytes are read from each row)
//   in_p  : byte stride between consecutive rows of both in0 and in1
//   out   : first output row; 8 rows of 16 bytes are written
//   out_p : byte stride between consecutive output rows
//
// On return, out[c * out_p + r] equals byte c of input row r, for
// c = 0..7 and r = 0..15. Loads and stores are unaligned.
static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1,
                                  int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // Rows 0..7: load row pairs and byte-interleave them. The results are
  // kept in x0..x3 (rows 0/1, 2/3, 4/5, 6/7).
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x0 = _mm_unpacklo_epi8(x0, x1);

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
  x1 = _mm_unpacklo_epi8(x2, x3);

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
  x2 = _mm_unpacklo_epi8(x4, x5);

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
  x3 = _mm_unpacklo_epi8(x6, x7);
  x4 = _mm_unpacklo_epi16(x0, x1);  // columns 0..3, rows 0..3

  // Rows 8..15: same treatment, results kept in x8..x11.
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x8 = _mm_unpacklo_epi8(x8, x9);
  x5 = _mm_unpacklo_epi16(x2, x3);  // columns 0..3, rows 4..7

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
  x9 = _mm_unpacklo_epi8(x10, x11);

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
  x10 = _mm_unpacklo_epi8(x12, x13);
  x12 = _mm_unpacklo_epi16(x8, x9);  // columns 0..3, rows 8..11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
  x11 = _mm_unpacklo_epi8(x14, x15);
  x13 = _mm_unpacklo_epi16(x10, x11);  // columns 0..3, rows 12..15

  // x6 / x7: columns 0|1 and 2|3 of rows 0..7;
  // x14 / x15: the same columns of rows 8..15.
  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  // Repeat for columns 4..7 using the high halves of the interleaved rows.
  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
566 
// Transposes an 8-row x 16-column block of bytes from memory into a
// 16-row x 8-column block in memory.
//
//   src   : first input row; 8 rows of 16 bytes are read
//   in_p  : byte stride between consecutive input rows
//   dst   : first output row; 16 rows of 8 bytes are written
//   out_p : byte stride between consecutive output rows
//
// On return, dst[k * out_p + r] == src[r * in_p + k] for k = 0..15 and
// r = 0..7. Loads are unaligned; stores go through mm_storelu/mm_storehu.
//
// Notation in the comments below: a letter names a column (a..h are columns
// 0..7, A..H are columns 8..15) and the digit names the row (0..7).
static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p,
                                          unsigned char *dst, int out_p) {
  // x0: a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
  // x1: a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
  // ...
  // x7: a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
  const __m128i x0 = _mm_loadu_si128((__m128i *)(src));
  const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p)));
  const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p)));
  const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p)));
  const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p)));
  const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p)));
  const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p)));
  const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p)));

  // Stage 1: byte-interleave adjacent row pairs.
  // x_s10: a0 a1 b0 b1 c0 c1 d0 d1 e0 e1 f0 f1 g0 g1 h0 h1
  // x_s11: A0 A1 B0 B1 C0 C1 D0 D1 E0 E1 F0 F1 G0 G1 H0 H1
  // x_s12 / x_s13: the same for rows 2,3
  // x_s14 / x_s15: the same for rows 4,5
  // x_s16 / x_s17: the same for rows 6,7
  const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1);
  const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1);
  const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3);
  const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3);
  const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5);
  const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5);
  const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7);
  const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7);

  // Stage 2: 16-bit interleave, giving four rows of four columns each.
  // x_s20: a0 a1 a2 a3 b0 b1 b2 b3 c0 c1 c2 c3 d0 d1 d2 d3
  // x_s21: e0 e1 e2 e3 f0 f1 f2 f3 g0 g1 g2 g3 h0 h1 h2 h3
  // x_s22: A0 A1 A2 A3 B0 B1 B2 B3 C0 C1 C2 C3 D0 D1 D2 D3
  // x_s23: E0 E1 E2 E3 F0 F1 F2 F3 G0 G1 G2 G3 H0 H1 H2 H3
  // x_s24..x_s27: the same columns for rows 4..7
  const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12);
  const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12);
  const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13);
  const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13);
  const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16);
  const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16);
  const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17);
  const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17);

  // Stage 3: 32-bit interleave, giving two complete columns per register.
  // x_s30: a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
  // x_s31: c0 .. c7 | d0 .. d7
  // x_s32: e0 .. e7 | f0 .. f7
  // x_s33: g0 .. g7 | h0 .. h7
  // x_s34: A0 .. A7 | B0 .. B7
  // x_s35: C0 .. C7 | D0 .. D7
  // x_s36: E0 .. E7 | F0 .. F7
  // x_s37: G0 .. G7 | H0 .. H7
  const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24);
  const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24);
  const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25);
  const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25);
  const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26);
  const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26);
  const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27);
  const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27);

  // Each register supplies two output rows: low half, then high half.
  mm_storelu(dst, x_s30);
  mm_storehu(dst + (1 * out_p), x_s30);
  mm_storelu(dst + (2 * out_p), x_s31);
  mm_storehu(dst + (3 * out_p), x_s31);
  mm_storelu(dst + (4 * out_p), x_s32);
  mm_storehu(dst + (5 * out_p), x_s32);
  mm_storelu(dst + (6 * out_p), x_s33);
  mm_storehu(dst + (7 * out_p), x_s33);
  mm_storelu(dst + (8 * out_p), x_s34);
  mm_storehu(dst + (9 * out_p), x_s34);
  mm_storelu(dst + (10 * out_p), x_s35);
  mm_storehu(dst + (11 * out_p), x_s35);
  mm_storelu(dst + (12 * out_p), x_s36);
  mm_storehu(dst + (13 * out_p), x_s36);
  mm_storelu(dst + (14 * out_p), x_s37);
  mm_storehu(dst + (15 * out_p), x_s37);
}
654 
// Transposes one or more independent 8x8 blocks of bytes.
//
//   src[]                : src[i] points to the first row of input block i
//   in_p                 : byte stride between rows of every input block
//   dst[]                : dst[i] points to the first row of output block i
//   out_p                : byte stride between rows of every output block
//   num_8x8_to_transpose : number of blocks to process
//
// For each block, out[c * out_p + r] == in[r * in_p + c] for r, c = 0..7.
// The loop is a do/while, so block 0 is always processed: src[0] and dst[0]
// must be valid even if num_8x8_to_transpose is less than 1.
static INLINE void transpose_8xn(unsigned char *src[], int in_p,
                                 unsigned char *dst[], int out_p,
                                 int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    mm_storelu(out + 0 * out_p, x6);  // 00 10 20 30 40 50 60 70
    mm_storehu(out + 1 * out_p, x6);  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    mm_storelu(out + 2 * out_p, x7);  // 02 12 22 32 42 52 62 72
    mm_storehu(out + 3 * out_p, x7);  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    mm_storelu(out + 4 * out_p, x6);  // 04 14 24 34 44 54 64 74
    mm_storehu(out + 5 * out_p, x6);  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    mm_storelu(out + 6 * out_p, x7);  // 06 16 26 36 46 56 66 76
    mm_storehu(out + 7 * out_p, x7);  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
720 
721 #endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
722