/*
 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <tmmintrin.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// -----------------------------------------------------------------------------
/*
; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
; ------------------------------------------
*/
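/*
; e.g. with x=1, y=2, z=4 (values chosen only to illustrate the identity):
;   exact:  (1 + 2*2 + 4 + 2) >> 2 = 11 >> 2 = 2
;   trick:  avg(1,4) = 3;  3 - ((1^4) & 1) = 2;  avg(2,2) = 2
*/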
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y);
}

void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
  (void)left;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, avg3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
  dst[3] = above[7];  // aka H
}
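
// Advance *row by one 16-bit lane (dropping lane 0 and shifting in lane 0 of
// *ar at the top), store the 8-lane row, and step *dst down by one row.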
static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
                               __m128i *row, const __m128i *ar) {
  *row = _mm_alignr_epi8(*ar, *row, 2);
  _mm_store_si128((__m128i *)*dst, *row);
  *dst += stride;
}

void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3);
  dst += stride;
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
}
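
// Like d45_store_8(), but for a 16-lane row held in (*row_0, *row_1): the
// pair is shifted down by one lane, refilled from *ar, and stored.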
static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                __m128i *row_0, __m128i *row_1,
                                const __m128i *ar) {
  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
  _mm_store_si128((__m128i *)*dst, *row_0);
  _mm_store_si128((__m128i *)(*dst + 8), *row_1);
  *dst += stride;
}

void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  dst += stride;
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
}

void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  int i;
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
  dst += stride;
  for (i = 1; i < 32; ++i) {
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
  }
}
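
// Byte-shuffle mask for _mm_shuffle_epi8() that rotates the eight 16-bit
// lanes of a vector by one position: output lane i takes input lane
// (i + 1) & 7.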
DECLARE_ALIGNED(16, static const uint8_t,
                rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9,
                                            10, 11, 12, 13, 14, 15, 0, 1 };
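
// Rotate *a by one 16-bit lane using the mask above and return the result;
// *a is updated in place, so successive calls step through its lanes.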
static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
  *a = _mm_shuffle_epi8(*a, *rotrw);
  return *a;
}

void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i IXABCDEF =
      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
  __m128i rowa = avg2;
  __m128i rowb = avg3;
  int i;
  (void)bd;
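  // Rows alternate between the two-tap (avg2) and three-tap (avg3) averages
  // of the row above; after each pair of rows, successive values from the
  // filtered left column are rotated in at the front of rowa and rowb.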
  for (i = 0; i < 8; i += 2) {
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
    _mm_store_si128((__m128i *)dst, rowb);
    dst += stride;
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
  }
}

void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_srli_si128(L1, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      dst += stride;
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}

void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
  const __m128i L3_ = _mm_srli_si128(L3, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowa_2 = avg2_2;
  __m128i rowa_3 = avg2_3;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i rowb_2 = avg3_2;
  __m128i rowb_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
      dst += stride;
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}

void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  __m128i rowa = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; ++i) {
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
  }
}

void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_srli_si128(B1, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
    }
  }
}

void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
  const __m128i C3 = _mm_srli_si128(B3, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i rowa_2 = avg3_2;
  __m128i rowa_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
    }
  }
}

void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
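  // Each row below is the row above it shifted right by two samples, with the
  // next interleaved (avg2, avg3) pair from the left column entering on the
  // left.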
  const __m128i row0 =
      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
  const __m128i row1 =
      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
  const __m128i row2 =
      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
  const __m128i row4 =
      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
  const __m128i row5 =
      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
  const __m128i row6 =
      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
  (void)bd;
  _mm_store_si128((__m128i *)dst, row0);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row2);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row4);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row5);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row6);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row7);
}

void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}

void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_srli_si128(A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_srli_si128(A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i row_2 = avg3_2;
  __m128i row_3 = avg3_3;
  __m128i avg2_avg3_left[4][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);

  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
    }
  }
}
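
// Store four rows for the d207 predictors: each successive row is the
// previous one advanced by one interleaved (avg2, avg3) pair (4 bytes). The
// 4x16 and 4x32 variants below apply the same scheme to wider rows.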
static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                  const __m128i *a, const __m128i *b) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
  (void)above;
  (void)bd;
  d207_store_4x8(&dst, stride, &out_a, &out_b);
  d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
}

static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}

static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c, const __m128i *d,
                                   const __m128i *e) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  _mm_store_si128((__m128i *)(*dst + 16), *c);
  _mm_store_si128((__m128i *)(*dst + 24), *d);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
  const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
  const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
  const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
  (void)above;
  (void)bd;
  d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
  d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
  d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
  d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
  d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
  d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
}
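
// Store four rows for the d63 predictors: rows alternate between the avg2
// and avg3 registers, which both advance by one lane after every two rows.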
static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                 __m128i *a, __m128i *b, const __m128i *ar) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
}

void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  (void)left;
  (void)bd;
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
}

void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 14; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
}

void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 30; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    _mm_store_si128((__m128i *)(dst + 16), avg2_2);
    _mm_store_si128((__m128i *)(dst + 24), avg2_3);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
    avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
    avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  _mm_store_si128((__m128i *)(dst + 16), avg2_2);
  _mm_store_si128((__m128i *)(dst + 24), avg2_3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
}