1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <tmmintrin.h>
13
14 #include "config/aom_dsp_rtcd.h"
15
16 #include "aom_dsp/intrapred_common.h"
17
18 // -----------------------------------------------------------------------------
19 // PAETH_PRED
20
21 // Return 8 16-bit pixels in one row
paeth_8x1_pred(const __m128i * left,const __m128i * top,const __m128i * topleft)22 static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
23 const __m128i *topleft) {
24 const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
25
26 __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
27 __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
28 __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
29
30 __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
31 mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
32 __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
33
34 pl = _mm_andnot_si128(mask1, *left);
35
36 ptl = _mm_and_si128(mask2, *topleft);
37 pt = _mm_andnot_si128(mask2, *top);
38 pt = _mm_or_si128(pt, ptl);
39 pt = _mm_and_si128(mask1, pt);
40
41 return _mm_or_si128(pl, pt);
42 }
43
aom_paeth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)44 void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
45 const uint8_t *above, const uint8_t *left) {
46 __m128i l = _mm_loadl_epi64((const __m128i *)left);
47 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
48 const __m128i zero = _mm_setzero_si128();
49 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
50 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
51 __m128i rep = _mm_set1_epi16((short)0x8000);
52 const __m128i one = _mm_set1_epi16(1);
53
54 int i;
55 for (i = 0; i < 4; ++i) {
56 const __m128i l16 = _mm_shuffle_epi8(l, rep);
57 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
58
59 *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
60 dst += stride;
61 rep = _mm_add_epi16(rep, one);
62 }
63 }
64
aom_paeth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)65 void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
66 const uint8_t *above, const uint8_t *left) {
67 __m128i l = _mm_loadl_epi64((const __m128i *)left);
68 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
69 const __m128i zero = _mm_setzero_si128();
70 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
71 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
72 __m128i rep = _mm_set1_epi16((short)0x8000);
73 const __m128i one = _mm_set1_epi16(1);
74
75 int i;
76 for (i = 0; i < 8; ++i) {
77 const __m128i l16 = _mm_shuffle_epi8(l, rep);
78 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
79
80 *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
81 dst += stride;
82 rep = _mm_add_epi16(rep, one);
83 }
84 }
85
aom_paeth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)86 void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
87 const uint8_t *above, const uint8_t *left) {
88 __m128i l = _mm_load_si128((const __m128i *)left);
89 const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
90 const __m128i zero = _mm_setzero_si128();
91 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
92 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
93 __m128i rep = _mm_set1_epi16((short)0x8000);
94 const __m128i one = _mm_set1_epi16(1);
95
96 for (int i = 0; i < 16; ++i) {
97 const __m128i l16 = _mm_shuffle_epi8(l, rep);
98 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
99
100 *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
101 dst += stride;
102 rep = _mm_add_epi16(rep, one);
103 }
104 }
105
aom_paeth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)106 void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
107 const uint8_t *above, const uint8_t *left) {
108 __m128i l = _mm_loadl_epi64((const __m128i *)left);
109 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
110 const __m128i zero = _mm_setzero_si128();
111 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
112 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
113 __m128i rep = _mm_set1_epi16((short)0x8000);
114 const __m128i one = _mm_set1_epi16(1);
115
116 int i;
117 for (i = 0; i < 4; ++i) {
118 const __m128i l16 = _mm_shuffle_epi8(l, rep);
119 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
120
121 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
122 dst += stride;
123 rep = _mm_add_epi16(rep, one);
124 }
125 }
126
aom_paeth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)127 void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
128 const uint8_t *above, const uint8_t *left) {
129 __m128i l = _mm_loadl_epi64((const __m128i *)left);
130 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
131 const __m128i zero = _mm_setzero_si128();
132 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
133 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
134 __m128i rep = _mm_set1_epi16((short)0x8000);
135 const __m128i one = _mm_set1_epi16(1);
136
137 int i;
138 for (i = 0; i < 8; ++i) {
139 const __m128i l16 = _mm_shuffle_epi8(l, rep);
140 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
141
142 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
143 dst += stride;
144 rep = _mm_add_epi16(rep, one);
145 }
146 }
147
aom_paeth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)148 void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
149 const uint8_t *above, const uint8_t *left) {
150 __m128i l = _mm_load_si128((const __m128i *)left);
151 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
152 const __m128i zero = _mm_setzero_si128();
153 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
154 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
155 __m128i rep = _mm_set1_epi16((short)0x8000);
156 const __m128i one = _mm_set1_epi16(1);
157
158 int i;
159 for (i = 0; i < 16; ++i) {
160 const __m128i l16 = _mm_shuffle_epi8(l, rep);
161 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
162
163 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
164 dst += stride;
165 rep = _mm_add_epi16(rep, one);
166 }
167 }
168
aom_paeth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)169 void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
170 const uint8_t *above, const uint8_t *left) {
171 const __m128i t = _mm_loadl_epi64((const __m128i *)above);
172 const __m128i zero = _mm_setzero_si128();
173 const __m128i t16 = _mm_unpacklo_epi8(t, zero);
174 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
175 const __m128i one = _mm_set1_epi16(1);
176
177 for (int j = 0; j < 2; ++j) {
178 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
179 __m128i rep = _mm_set1_epi16((short)0x8000);
180 for (int i = 0; i < 16; ++i) {
181 const __m128i l16 = _mm_shuffle_epi8(l, rep);
182 const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
183
184 _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
185 dst += stride;
186 rep = _mm_add_epi16(rep, one);
187 }
188 }
189 }
190
191 // Return 16 8-bit pixels in one row
paeth_16x1_pred(const __m128i * left,const __m128i * top0,const __m128i * top1,const __m128i * topleft)192 static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
193 const __m128i *top1,
194 const __m128i *topleft) {
195 const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
196 const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
197 return _mm_packus_epi16(p0, p1);
198 }
199
aom_paeth_predictor_16x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)200 void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
201 const uint8_t *above, const uint8_t *left) {
202 __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
203 const __m128i t = _mm_load_si128((const __m128i *)above);
204 const __m128i zero = _mm_setzero_si128();
205 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
206 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
207 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
208 __m128i rep = _mm_set1_epi16((short)0x8000);
209 const __m128i one = _mm_set1_epi16(1);
210
211 for (int i = 0; i < 4; ++i) {
212 const __m128i l16 = _mm_shuffle_epi8(l, rep);
213 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
214
215 _mm_store_si128((__m128i *)dst, row);
216 dst += stride;
217 rep = _mm_add_epi16(rep, one);
218 }
219 }
220
aom_paeth_predictor_16x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)221 void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
222 const uint8_t *above, const uint8_t *left) {
223 __m128i l = _mm_loadl_epi64((const __m128i *)left);
224 const __m128i t = _mm_load_si128((const __m128i *)above);
225 const __m128i zero = _mm_setzero_si128();
226 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
227 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
228 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
229 __m128i rep = _mm_set1_epi16((short)0x8000);
230 const __m128i one = _mm_set1_epi16(1);
231
232 int i;
233 for (i = 0; i < 8; ++i) {
234 const __m128i l16 = _mm_shuffle_epi8(l, rep);
235 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
236
237 _mm_store_si128((__m128i *)dst, row);
238 dst += stride;
239 rep = _mm_add_epi16(rep, one);
240 }
241 }
242
aom_paeth_predictor_16x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)243 void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
244 const uint8_t *above,
245 const uint8_t *left) {
246 __m128i l = _mm_load_si128((const __m128i *)left);
247 const __m128i t = _mm_load_si128((const __m128i *)above);
248 const __m128i zero = _mm_setzero_si128();
249 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
250 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
251 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
252 __m128i rep = _mm_set1_epi16((short)0x8000);
253 const __m128i one = _mm_set1_epi16(1);
254
255 int i;
256 for (i = 0; i < 16; ++i) {
257 const __m128i l16 = _mm_shuffle_epi8(l, rep);
258 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
259
260 _mm_store_si128((__m128i *)dst, row);
261 dst += stride;
262 rep = _mm_add_epi16(rep, one);
263 }
264 }
265
aom_paeth_predictor_16x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)266 void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
267 const uint8_t *above,
268 const uint8_t *left) {
269 __m128i l = _mm_load_si128((const __m128i *)left);
270 const __m128i t = _mm_load_si128((const __m128i *)above);
271 const __m128i zero = _mm_setzero_si128();
272 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
273 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
274 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
275 __m128i rep = _mm_set1_epi16((short)0x8000);
276 const __m128i one = _mm_set1_epi16(1);
277 __m128i l16;
278
279 int i;
280 for (i = 0; i < 16; ++i) {
281 l16 = _mm_shuffle_epi8(l, rep);
282 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
283
284 _mm_store_si128((__m128i *)dst, row);
285 dst += stride;
286 rep = _mm_add_epi16(rep, one);
287 }
288
289 l = _mm_load_si128((const __m128i *)(left + 16));
290 rep = _mm_set1_epi16((short)0x8000);
291 for (i = 0; i < 16; ++i) {
292 l16 = _mm_shuffle_epi8(l, rep);
293 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
294
295 _mm_store_si128((__m128i *)dst, row);
296 dst += stride;
297 rep = _mm_add_epi16(rep, one);
298 }
299 }
300
aom_paeth_predictor_16x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)301 void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
302 const uint8_t *above,
303 const uint8_t *left) {
304 const __m128i t = _mm_load_si128((const __m128i *)above);
305 const __m128i zero = _mm_setzero_si128();
306 const __m128i top0 = _mm_unpacklo_epi8(t, zero);
307 const __m128i top1 = _mm_unpackhi_epi8(t, zero);
308 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
309 const __m128i one = _mm_set1_epi16(1);
310
311 for (int j = 0; j < 4; ++j) {
312 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
313 __m128i rep = _mm_set1_epi16((short)0x8000);
314 for (int i = 0; i < 16; ++i) {
315 const __m128i l16 = _mm_shuffle_epi8(l, rep);
316 const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
317 _mm_store_si128((__m128i *)dst, row);
318 dst += stride;
319 rep = _mm_add_epi16(rep, one);
320 }
321 }
322 }
323
aom_paeth_predictor_32x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)324 void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
325 const uint8_t *above, const uint8_t *left) {
326 const __m128i a = _mm_load_si128((const __m128i *)above);
327 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
328 const __m128i zero = _mm_setzero_si128();
329 const __m128i al = _mm_unpacklo_epi8(a, zero);
330 const __m128i ah = _mm_unpackhi_epi8(a, zero);
331 const __m128i bl = _mm_unpacklo_epi8(b, zero);
332 const __m128i bh = _mm_unpackhi_epi8(b, zero);
333
334 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
335 __m128i rep = _mm_set1_epi16((short)0x8000);
336 const __m128i one = _mm_set1_epi16(1);
337 const __m128i l = _mm_loadl_epi64((const __m128i *)left);
338 __m128i l16;
339
340 for (int i = 0; i < 8; ++i) {
341 l16 = _mm_shuffle_epi8(l, rep);
342 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
343 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
344
345 _mm_store_si128((__m128i *)dst, r32l);
346 _mm_store_si128((__m128i *)(dst + 16), r32h);
347 dst += stride;
348 rep = _mm_add_epi16(rep, one);
349 }
350 }
351
aom_paeth_predictor_32x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)352 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
353 const uint8_t *above,
354 const uint8_t *left) {
355 const __m128i a = _mm_load_si128((const __m128i *)above);
356 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
357 const __m128i zero = _mm_setzero_si128();
358 const __m128i al = _mm_unpacklo_epi8(a, zero);
359 const __m128i ah = _mm_unpackhi_epi8(a, zero);
360 const __m128i bl = _mm_unpacklo_epi8(b, zero);
361 const __m128i bh = _mm_unpackhi_epi8(b, zero);
362
363 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
364 __m128i rep = _mm_set1_epi16((short)0x8000);
365 const __m128i one = _mm_set1_epi16(1);
366 __m128i l = _mm_load_si128((const __m128i *)left);
367 __m128i l16;
368
369 int i;
370 for (i = 0; i < 16; ++i) {
371 l16 = _mm_shuffle_epi8(l, rep);
372 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
373 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
374
375 _mm_store_si128((__m128i *)dst, r32l);
376 _mm_store_si128((__m128i *)(dst + 16), r32h);
377 dst += stride;
378 rep = _mm_add_epi16(rep, one);
379 }
380 }
381
aom_paeth_predictor_32x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)382 void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
383 const uint8_t *above,
384 const uint8_t *left) {
385 const __m128i a = _mm_load_si128((const __m128i *)above);
386 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
387 const __m128i zero = _mm_setzero_si128();
388 const __m128i al = _mm_unpacklo_epi8(a, zero);
389 const __m128i ah = _mm_unpackhi_epi8(a, zero);
390 const __m128i bl = _mm_unpacklo_epi8(b, zero);
391 const __m128i bh = _mm_unpackhi_epi8(b, zero);
392
393 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
394 __m128i rep = _mm_set1_epi16((short)0x8000);
395 const __m128i one = _mm_set1_epi16(1);
396 __m128i l = _mm_load_si128((const __m128i *)left);
397 __m128i l16;
398
399 int i;
400 for (i = 0; i < 16; ++i) {
401 l16 = _mm_shuffle_epi8(l, rep);
402 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
403 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
404
405 _mm_store_si128((__m128i *)dst, r32l);
406 _mm_store_si128((__m128i *)(dst + 16), r32h);
407 dst += stride;
408 rep = _mm_add_epi16(rep, one);
409 }
410
411 rep = _mm_set1_epi16((short)0x8000);
412 l = _mm_load_si128((const __m128i *)(left + 16));
413 for (i = 0; i < 16; ++i) {
414 l16 = _mm_shuffle_epi8(l, rep);
415 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
416 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
417
418 _mm_store_si128((__m128i *)dst, r32l);
419 _mm_store_si128((__m128i *)(dst + 16), r32h);
420 dst += stride;
421 rep = _mm_add_epi16(rep, one);
422 }
423 }
424
aom_paeth_predictor_32x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)425 void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
426 const uint8_t *above,
427 const uint8_t *left) {
428 const __m128i a = _mm_load_si128((const __m128i *)above);
429 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
430 const __m128i zero = _mm_setzero_si128();
431 const __m128i al = _mm_unpacklo_epi8(a, zero);
432 const __m128i ah = _mm_unpackhi_epi8(a, zero);
433 const __m128i bl = _mm_unpacklo_epi8(b, zero);
434 const __m128i bh = _mm_unpackhi_epi8(b, zero);
435
436 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
437 const __m128i one = _mm_set1_epi16(1);
438 __m128i l16;
439
440 int i, j;
441 for (j = 0; j < 4; ++j) {
442 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
443 __m128i rep = _mm_set1_epi16((short)0x8000);
444 for (i = 0; i < 16; ++i) {
445 l16 = _mm_shuffle_epi8(l, rep);
446 const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
447 const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
448
449 _mm_store_si128((__m128i *)dst, r32l);
450 _mm_store_si128((__m128i *)(dst + 16), r32h);
451 dst += stride;
452 rep = _mm_add_epi16(rep, one);
453 }
454 }
455 }
456
aom_paeth_predictor_64x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
458 const uint8_t *above,
459 const uint8_t *left) {
460 const __m128i a = _mm_load_si128((const __m128i *)above);
461 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
462 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
463 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
464 const __m128i zero = _mm_setzero_si128();
465 const __m128i al = _mm_unpacklo_epi8(a, zero);
466 const __m128i ah = _mm_unpackhi_epi8(a, zero);
467 const __m128i bl = _mm_unpacklo_epi8(b, zero);
468 const __m128i bh = _mm_unpackhi_epi8(b, zero);
469 const __m128i cl = _mm_unpacklo_epi8(c, zero);
470 const __m128i ch = _mm_unpackhi_epi8(c, zero);
471 const __m128i dl = _mm_unpacklo_epi8(d, zero);
472 const __m128i dh = _mm_unpackhi_epi8(d, zero);
473
474 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
475 const __m128i one = _mm_set1_epi16(1);
476 __m128i l16;
477
478 int i, j;
479 for (j = 0; j < 2; ++j) {
480 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
481 __m128i rep = _mm_set1_epi16((short)0x8000);
482 for (i = 0; i < 16; ++i) {
483 l16 = _mm_shuffle_epi8(l, rep);
484 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
485 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
486 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
487 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
488
489 _mm_store_si128((__m128i *)dst, r0);
490 _mm_store_si128((__m128i *)(dst + 16), r1);
491 _mm_store_si128((__m128i *)(dst + 32), r2);
492 _mm_store_si128((__m128i *)(dst + 48), r3);
493 dst += stride;
494 rep = _mm_add_epi16(rep, one);
495 }
496 }
497 }
498
aom_paeth_predictor_64x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
500 const uint8_t *above,
501 const uint8_t *left) {
502 const __m128i a = _mm_load_si128((const __m128i *)above);
503 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
504 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
505 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
506 const __m128i zero = _mm_setzero_si128();
507 const __m128i al = _mm_unpacklo_epi8(a, zero);
508 const __m128i ah = _mm_unpackhi_epi8(a, zero);
509 const __m128i bl = _mm_unpacklo_epi8(b, zero);
510 const __m128i bh = _mm_unpackhi_epi8(b, zero);
511 const __m128i cl = _mm_unpacklo_epi8(c, zero);
512 const __m128i ch = _mm_unpackhi_epi8(c, zero);
513 const __m128i dl = _mm_unpacklo_epi8(d, zero);
514 const __m128i dh = _mm_unpackhi_epi8(d, zero);
515
516 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
517 const __m128i one = _mm_set1_epi16(1);
518 __m128i l16;
519
520 int i, j;
521 for (j = 0; j < 4; ++j) {
522 const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
523 __m128i rep = _mm_set1_epi16((short)0x8000);
524 for (i = 0; i < 16; ++i) {
525 l16 = _mm_shuffle_epi8(l, rep);
526 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
527 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
528 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
529 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
530
531 _mm_store_si128((__m128i *)dst, r0);
532 _mm_store_si128((__m128i *)(dst + 16), r1);
533 _mm_store_si128((__m128i *)(dst + 32), r2);
534 _mm_store_si128((__m128i *)(dst + 48), r3);
535 dst += stride;
536 rep = _mm_add_epi16(rep, one);
537 }
538 }
539 }
540
aom_paeth_predictor_64x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)541 void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
542 const uint8_t *above,
543 const uint8_t *left) {
544 const __m128i a = _mm_load_si128((const __m128i *)above);
545 const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
546 const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
547 const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
548 const __m128i zero = _mm_setzero_si128();
549 const __m128i al = _mm_unpacklo_epi8(a, zero);
550 const __m128i ah = _mm_unpackhi_epi8(a, zero);
551 const __m128i bl = _mm_unpacklo_epi8(b, zero);
552 const __m128i bh = _mm_unpackhi_epi8(b, zero);
553 const __m128i cl = _mm_unpacklo_epi8(c, zero);
554 const __m128i ch = _mm_unpackhi_epi8(c, zero);
555 const __m128i dl = _mm_unpacklo_epi8(d, zero);
556 const __m128i dh = _mm_unpackhi_epi8(d, zero);
557
558 const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
559 const __m128i one = _mm_set1_epi16(1);
560 __m128i l16;
561
562 int i;
563 const __m128i l = _mm_load_si128((const __m128i *)left);
564 __m128i rep = _mm_set1_epi16((short)0x8000);
565 for (i = 0; i < 16; ++i) {
566 l16 = _mm_shuffle_epi8(l, rep);
567 const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
568 const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
569 const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
570 const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
571
572 _mm_store_si128((__m128i *)dst, r0);
573 _mm_store_si128((__m128i *)(dst + 16), r1);
574 _mm_store_si128((__m128i *)(dst + 32), r2);
575 _mm_store_si128((__m128i *)(dst + 48), r3);
576 dst += stride;
577 rep = _mm_add_epi16(rep, one);
578 }
579 }
580
581 // -----------------------------------------------------------------------------
582 // SMOOTH_PRED
583
584 // pixels[0]: above and below_pred interleave vector
585 // pixels[1]: left vector
586 // pixels[2]: right_pred vector
load_pixel_w4(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)587 static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
588 int height, __m128i *pixels) {
589 __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
590 if (height == 4)
591 pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
592 else if (height == 8)
593 pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
594 else
595 pixels[1] = _mm_loadu_si128(((const __m128i *)left));
596
597 pixels[2] = _mm_set1_epi16((int16_t)above[3]);
598
599 const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
600 const __m128i zero = _mm_setzero_si128();
601 d = _mm_unpacklo_epi8(d, zero);
602 pixels[0] = _mm_unpacklo_epi16(d, bp);
603 }
604
605 // weight_h[0]: weight_h vector
606 // weight_h[1]: scale - weight_h vector
607 // weight_h[2]: same as [0], second half for height = 16 only
608 // weight_h[3]: same as [1], second half for height = 16 only
609 // weight_w[0]: weights_w and scale - weights_w interleave vector
load_weight_w4(int height,__m128i * weight_h,__m128i * weight_w)610 static INLINE void load_weight_w4(int height, __m128i *weight_h,
611 __m128i *weight_w) {
612 const __m128i zero = _mm_setzero_si128();
613 const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
614 const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
615 weight_h[0] = _mm_unpacklo_epi8(t, zero);
616 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
617 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
618
619 if (height == 8) {
620 const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
621 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
622 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
623 } else if (height == 16) {
624 const __m128i weight =
625 _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
626 weight_h[0] = _mm_unpacklo_epi8(weight, zero);
627 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
628 weight_h[2] = _mm_unpackhi_epi8(weight, zero);
629 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
630 }
631 }
632
smooth_pred_4xh(const __m128i * pixel,const __m128i * wh,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)633 static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
634 const __m128i *ww, int h, uint8_t *dst,
635 ptrdiff_t stride, int second_half) {
636 const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
637 const __m128i one = _mm_set1_epi16(1);
638 const __m128i inc = _mm_set1_epi16(0x202);
639 const __m128i gat = _mm_set1_epi32(0xc080400);
640 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
641 : _mm_set1_epi16((short)0x8000);
642 __m128i d = _mm_set1_epi16(0x100);
643
644 for (int i = 0; i < h; ++i) {
645 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
646 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
647 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
648 __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
649
650 __m128i b = _mm_shuffle_epi8(pixel[1], rep);
651 b = _mm_unpacklo_epi16(b, pixel[2]);
652 __m128i sum = _mm_madd_epi16(b, ww[0]);
653
654 sum = _mm_add_epi32(s, sum);
655 sum = _mm_add_epi32(sum, round);
656 sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
657
658 sum = _mm_shuffle_epi8(sum, gat);
659 *(int *)dst = _mm_cvtsi128_si32(sum);
660 dst += stride;
661
662 rep = _mm_add_epi16(rep, one);
663 d = _mm_add_epi16(d, inc);
664 }
665 }
666
aom_smooth_predictor_4x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)667 void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
668 const uint8_t *above, const uint8_t *left) {
669 __m128i pixels[3];
670 load_pixel_w4(above, left, 4, pixels);
671
672 __m128i wh[4], ww[2];
673 load_weight_w4(4, wh, ww);
674
675 smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
676 }
677
aom_smooth_predictor_4x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)678 void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
679 const uint8_t *above, const uint8_t *left) {
680 __m128i pixels[3];
681 load_pixel_w4(above, left, 8, pixels);
682
683 __m128i wh[4], ww[2];
684 load_weight_w4(8, wh, ww);
685
686 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
687 }
688
aom_smooth_predictor_4x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)689 void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
690 const uint8_t *above,
691 const uint8_t *left) {
692 __m128i pixels[3];
693 load_pixel_w4(above, left, 16, pixels);
694
695 __m128i wh[4], ww[2];
696 load_weight_w4(16, wh, ww);
697
698 smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
699 dst += stride << 3;
700 smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
701 }
702
703 // pixels[0]: above and below_pred interleave vector, first half
704 // pixels[1]: above and below_pred interleave vector, second half
705 // pixels[2]: left vector
706 // pixels[3]: right_pred vector
707 // pixels[4]: above and below_pred interleave vector, first half
708 // pixels[5]: above and below_pred interleave vector, second half
709 // pixels[6]: left vector + 16
710 // pixels[7]: right_pred vector
load_pixel_w8(const uint8_t * above,const uint8_t * left,int height,__m128i * pixels)711 static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
712 int height, __m128i *pixels) {
713 const __m128i zero = _mm_setzero_si128();
714 const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
715 __m128i d = _mm_loadl_epi64((const __m128i *)above);
716 d = _mm_unpacklo_epi8(d, zero);
717 pixels[0] = _mm_unpacklo_epi16(d, bp);
718 pixels[1] = _mm_unpackhi_epi16(d, bp);
719
720 pixels[3] = _mm_set1_epi16((int16_t)above[7]);
721
722 if (height == 4) {
723 pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
724 } else if (height == 8) {
725 pixels[2] = _mm_loadl_epi64((const __m128i *)left);
726 } else if (height == 16) {
727 pixels[2] = _mm_load_si128((const __m128i *)left);
728 } else {
729 pixels[2] = _mm_load_si128((const __m128i *)left);
730 pixels[4] = pixels[0];
731 pixels[5] = pixels[1];
732 pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
733 pixels[7] = pixels[3];
734 }
735 }
736
737 // weight_h[0]: weight_h vector
738 // weight_h[1]: scale - weight_h vector
739 // weight_h[2]: same as [0], offset 8
740 // weight_h[3]: same as [1], offset 8
741 // weight_h[4]: same as [0], offset 16
742 // weight_h[5]: same as [1], offset 16
743 // weight_h[6]: same as [0], offset 24
744 // weight_h[7]: same as [1], offset 24
745 // weight_w[0]: weights_w and scale - weights_w interleave vector, first half
746 // weight_w[1]: weights_w and scale - weights_w interleave vector, second half
load_weight_w8(int height,__m128i * weight_h,__m128i * weight_w)747 static INLINE void load_weight_w8(int height, __m128i *weight_h,
748 __m128i *weight_w) {
749 const __m128i zero = _mm_setzero_si128();
750 const int we_offset = height < 8 ? 0 : 4;
751 __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
752 weight_h[0] = _mm_unpacklo_epi8(we, zero);
753 const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
754 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
755
756 if (height == 4) {
757 we = _mm_srli_si128(we, 4);
758 __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
759 __m128i tmp2 = _mm_sub_epi16(d, tmp1);
760 weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
761 weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
762 } else {
763 weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
764 weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
765 }
766
767 if (height == 16) {
768 we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
769 weight_h[0] = _mm_unpacklo_epi8(we, zero);
770 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
771 weight_h[2] = _mm_unpackhi_epi8(we, zero);
772 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
773 } else if (height == 32) {
774 const __m128i weight_lo =
775 _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
776 weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
777 weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
778 weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
779 weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
780 const __m128i weight_hi =
781 _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
782 weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
783 weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
784 weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
785 weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
786 }
787 }
788
smooth_pred_8xh(const __m128i * pixels,const __m128i * wh,const __m128i * ww,int h,uint8_t * dst,ptrdiff_t stride,int second_half)789 static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
790 const __m128i *ww, int h, uint8_t *dst,
791 ptrdiff_t stride, int second_half) {
792 const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
793 const __m128i one = _mm_set1_epi16(1);
794 const __m128i inc = _mm_set1_epi16(0x202);
795 const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
796
797 __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
798 : _mm_set1_epi16((short)0x8000);
799 __m128i d = _mm_set1_epi16(0x100);
800
801 int i;
802 for (i = 0; i < h; ++i) {
803 const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
804 const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
805 const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
806 __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
807 __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
808
809 __m128i b = _mm_shuffle_epi8(pixels[2], rep);
810 b = _mm_unpacklo_epi16(b, pixels[3]);
811 __m128i sum0 = _mm_madd_epi16(b, ww[0]);
812 __m128i sum1 = _mm_madd_epi16(b, ww[1]);
813
814 s0 = _mm_add_epi32(s0, sum0);
815 s0 = _mm_add_epi32(s0, round);
816 s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
817
818 s1 = _mm_add_epi32(s1, sum1);
819 s1 = _mm_add_epi32(s1, round);
820 s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
821
822 sum0 = _mm_packus_epi16(s0, s1);
823 sum0 = _mm_shuffle_epi8(sum0, gat);
824 _mm_storel_epi64((__m128i *)dst, sum0);
825 dst += stride;
826
827 rep = _mm_add_epi16(rep, one);
828 d = _mm_add_epi16(d, inc);
829 }
830 }
831
aom_smooth_predictor_8x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)832 void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
833 const uint8_t *above, const uint8_t *left) {
834 __m128i pixels[4];
835 load_pixel_w8(above, left, 4, pixels);
836
837 __m128i wh[4], ww[2];
838 load_weight_w8(4, wh, ww);
839
840 smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
841 }
842
aom_smooth_predictor_8x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)843 void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
844 const uint8_t *above, const uint8_t *left) {
845 __m128i pixels[4];
846 load_pixel_w8(above, left, 8, pixels);
847
848 __m128i wh[4], ww[2];
849 load_weight_w8(8, wh, ww);
850
851 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
852 }
853
aom_smooth_predictor_8x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)854 void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
855 const uint8_t *above,
856 const uint8_t *left) {
857 __m128i pixels[4];
858 load_pixel_w8(above, left, 16, pixels);
859
860 __m128i wh[4], ww[2];
861 load_weight_w8(16, wh, ww);
862
863 smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
864 dst += stride << 3;
865 smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
866 }
867
aom_smooth_predictor_8x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)868 void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
869 const uint8_t *above,
870 const uint8_t *left) {
871 __m128i pixels[8];
872 load_pixel_w8(above, left, 32, pixels);
873
874 __m128i wh[8], ww[2];
875 load_weight_w8(32, wh, ww);
876
877 smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
878 dst += stride << 3;
879 smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
880 dst += stride << 3;
881 smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
882 dst += stride << 3;
883 smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
884 }
885
886 // TODO(slavarnway): Visual Studio only supports restrict when /std:c11
887 // (available in 2019+) or greater is specified; __restrict can be used in that
888 // case. This should be moved to rtcd and used consistently between the
889 // function declarations and definitions to avoid warnings in Visual Studio
890 // when defining LIBAOM_RESTRICT to restrict or __restrict.
891 #if defined(_MSC_VER)
892 #define LIBAOM_RESTRICT
893 #else
894 #define LIBAOM_RESTRICT restrict
895 #endif
896
Load4(const void * src)897 static AOM_FORCE_INLINE __m128i Load4(const void *src) {
898 // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
899 // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
900 // movss instruction.
901 //
902 // Until compiler support of _mm_loadu_si32 is widespread, use of
903 // _mm_loadu_si32 is banned.
904 int val;
905 memcpy(&val, src, sizeof(val));
906 return _mm_cvtsi32_si128(val);
907 }
908
LoadLo8(const void * a)909 static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
910 return _mm_loadl_epi64((const __m128i *)(a));
911 }
912
LoadUnaligned16(const void * a)913 static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
914 return _mm_loadu_si128((const __m128i *)(a));
915 }
916
Store4(void * dst,const __m128i x)917 static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
918 const int val = _mm_cvtsi128_si32(x);
919 memcpy(dst, &val, sizeof(val));
920 }
921
StoreLo8(void * a,const __m128i v)922 static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
923 _mm_storel_epi64((__m128i *)(a), v);
924 }
925
StoreUnaligned16(void * a,const __m128i v)926 static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
927 _mm_storeu_si128((__m128i *)(a), v);
928 }
929
cvtepu8_epi16(__m128i x)930 static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
931 return _mm_unpacklo_epi8((x), _mm_setzero_si128());
932 }
933
cvtepu8_epi32(__m128i x)934 static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
935 const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
936 return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
937 }
938
cvtepu16_epi32(__m128i x)939 static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
940 return _mm_unpacklo_epi16((x), _mm_setzero_si128());
941 }
942
smooth_predictor_wxh(uint8_t * LIBAOM_RESTRICT dst,ptrdiff_t stride,const uint8_t * LIBAOM_RESTRICT top_row,const uint8_t * LIBAOM_RESTRICT left_column,int width,int height)943 void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
944 const uint8_t *LIBAOM_RESTRICT top_row,
945 const uint8_t *LIBAOM_RESTRICT left_column, int width,
946 int height) {
947 const uint8_t *const sm_weights_h = smooth_weights + height - 4;
948 const uint8_t *const sm_weights_w = smooth_weights + width - 4;
949 const __m128i zero = _mm_setzero_si128();
950 const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
951 const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
952 const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
953 const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
954 for (int y = 0; y < height; ++y) {
955 const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
956 const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
957 const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
958 __m128i scaled_bottom_left =
959 _mm_mullo_epi16(scale_m_weights_y, bottom_left);
960 const __m128i weight_left_y =
961 _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
962 scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
963 scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
964 for (int x = 0; x < width; x += 8) {
965 const __m128i top_x = LoadLo8(top_row + x);
966 const __m128i weights_x = LoadLo8(sm_weights_w + x);
967 const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
968 const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
969 const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
970
971 // Here opposite weights and pixels are multiplied, where the order of
972 // interleaving is indicated in the names.
973 __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
974 __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
975
976 // |scaled_bottom_left| is always scaled by the same weight each row, so
977 // we only derive |scaled_top_right| values here.
978 const __m128i inverted_weights_x =
979 _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
980 const __m128i scaled_top_right =
981 _mm_mullo_epi16(inverted_weights_x, top_right);
982 const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
983 const __m128i scaled_top_right_hi =
984 _mm_unpackhi_epi16(scaled_top_right, zero);
985 pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
986 pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
987 pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
988 pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
989
990 // The round value for RightShiftWithRounding was added with
991 // |scaled_bottom_left|.
992 pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
993 pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
994 const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
995 StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
996 }
997 dst += stride;
998 }
999 }
1000
aom_smooth_predictor_16x4_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1001 void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
1002 const uint8_t *above,
1003 const uint8_t *left) {
1004 smooth_predictor_wxh(dst, stride, above, left, 16, 4);
1005 }
1006
aom_smooth_predictor_16x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1007 void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1008 const uint8_t *above,
1009 const uint8_t *left) {
1010 smooth_predictor_wxh(dst, stride, above, left, 16, 8);
1011 }
1012
aom_smooth_predictor_16x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1013 void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1014 const uint8_t *above,
1015 const uint8_t *left) {
1016 smooth_predictor_wxh(dst, stride, above, left, 16, 16);
1017 }
1018
aom_smooth_predictor_16x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1019 void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1020 const uint8_t *above,
1021 const uint8_t *left) {
1022 smooth_predictor_wxh(dst, stride, above, left, 16, 32);
1023 }
1024
aom_smooth_predictor_16x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1025 void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1026 const uint8_t *above,
1027 const uint8_t *left) {
1028 smooth_predictor_wxh(dst, stride, above, left, 16, 64);
1029 }
1030
aom_smooth_predictor_32x8_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1031 void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
1032 const uint8_t *above,
1033 const uint8_t *left) {
1034 smooth_predictor_wxh(dst, stride, above, left, 32, 8);
1035 }
1036
aom_smooth_predictor_32x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1037 void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1038 const uint8_t *above,
1039 const uint8_t *left) {
1040 smooth_predictor_wxh(dst, stride, above, left, 32, 16);
1041 }
1042
aom_smooth_predictor_32x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1043 void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1044 const uint8_t *above,
1045 const uint8_t *left) {
1046 smooth_predictor_wxh(dst, stride, above, left, 32, 32);
1047 }
1048
aom_smooth_predictor_32x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1049 void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1050 const uint8_t *above,
1051 const uint8_t *left) {
1052 smooth_predictor_wxh(dst, stride, above, left, 32, 64);
1053 }
1054
aom_smooth_predictor_64x16_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1055 void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
1056 const uint8_t *above,
1057 const uint8_t *left) {
1058 smooth_predictor_wxh(dst, stride, above, left, 64, 16);
1059 }
1060
aom_smooth_predictor_64x32_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1061 void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
1062 const uint8_t *above,
1063 const uint8_t *left) {
1064 smooth_predictor_wxh(dst, stride, above, left, 64, 32);
1065 }
1066
aom_smooth_predictor_64x64_ssse3(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)1067 void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
1068 const uint8_t *above,
1069 const uint8_t *left) {
1070 smooth_predictor_wxh(dst, stride, above, left, 64, 64);
1071 }
1072
1073 // -----------------------------------------------------------------------------
1074 // Smooth horizontal/vertical helper functions.
1075
1076 // For Horizontal, pixels1 and pixels2 are the same repeated value. For
1077 // Vertical, weights1 and weights2 are the same, and scaled_corner1 and
1078 // scaled_corner2 are the same.
write_smooth_directional_sum16(uint8_t * LIBAOM_RESTRICT dst,const __m128i pixels1,const __m128i pixels2,const __m128i weights1,const __m128i weights2,const __m128i scaled_corner1,const __m128i scaled_corner2,const __m128i round)1079 static AOM_FORCE_INLINE void write_smooth_directional_sum16(
1080 uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
1081 const __m128i weights1, const __m128i weights2,
1082 const __m128i scaled_corner1, const __m128i scaled_corner2,
1083 const __m128i round) {
1084 const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
1085 const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
1086 const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
1087 const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
1088 // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1089 const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
1090 const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
1091 StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
1092 }
1093
smooth_directional_sum8(const __m128i pixels,const __m128i weights,const __m128i scaled_corner)1094 static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
1095 const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
1096 const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
1097 return _mm_add_epi16(scaled_corner, weighted_px);
1098 }
1099
write_smooth_directional_sum8(uint8_t * LIBAOM_RESTRICT dst,const __m128i * pixels,const __m128i * weights,const __m128i * scaled_corner,const __m128i * round)1100 static AOM_FORCE_INLINE void write_smooth_directional_sum8(
1101 uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
1102 const __m128i *scaled_corner, const __m128i *round) {
1103 const __m128i pred_sum =
1104 smooth_directional_sum8(*pixels, *weights, *scaled_corner);
1105 // Equivalent to RightShiftWithRounding(pred[x][y], 8).
1106 const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
1107 StoreLo8(dst, _mm_packus_epi16(pred, pred));
1108 }
1109
1110 // -----------------------------------------------------------------------------
1111 // SMOOTH_V_PRED
1112
load_smooth_vertical_pixels4(const uint8_t * LIBAOM_RESTRICT above,const uint8_t * LIBAOM_RESTRICT left,const int height,__m128i * pixels)1113 static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
1114 const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
1115 const int height, __m128i *pixels) {
1116 __m128i top = Load4(above);
1117 const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
1118 top = cvtepu8_epi16(top);
1119 pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
1120 }
1121
1122 // |weight_array| alternates weight vectors from the table with their inverted
1123 // (256-w) counterparts. This is precomputed by the compiler when the weights
1124 // table is visible to this module. Removing this visibility can cut speed by up
1125 // to half in both 4xH and 8xH transforms.
load_smooth_vertical_weights4(const uint8_t * LIBAOM_RESTRICT weight_array,const int height,__m128i * weights)1126 static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
1127 const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
1128 __m128i *weights) {
1129 const __m128i inverter = _mm_set1_epi16(256);
1130
1131 if (height == 4) {
1132 const __m128i weight = Load4(weight_array);
1133 weights[0] = cvtepu8_epi16(weight);
1134 weights[1] = _mm_sub_epi16(inverter, weights[0]);
1135 } else if (height == 8) {
1136 const __m128i weight = LoadLo8(weight_array + 4);
1137 weights[0] = cvtepu8_epi16(weight);
1138 weights[1] = _mm_sub_epi16(inverter, weights[0]);
1139 } else {
1140 const __m128i weight = LoadUnaligned16(weight_array + 12);
1141 const __m128i zero = _mm_setzero_si128();
1142 weights[0] = cvtepu8_epi16(weight);
1143 weights[1] = _mm_sub_epi16(inverter, weights[0]);
1144 weights[2] = _mm_unpackhi_epi8(weight, zero);
1145 weights[3] = _mm_sub_epi16(inverter, weights[2]);
1146 }
1147 }
1148
write_smooth_vertical4xh(const __m128i * pixel,const __m128i * weight,const int height,uint8_t * LIBAOM_RESTRICT dst,const ptrdiff_t stride)1149 static AOM_FORCE_INLINE void write_smooth_vertical4xh(
1150 const __m128i *pixel, const __m128i *weight, const int height,
1151 uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
1152 const __m128i pred_round = _mm_set1_epi32(128);
1153 const __m128i mask_increment = _mm_set1_epi16(0x0202);
1154 const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
1155 __m128i y_select = _mm_set1_epi16(0x0100);
1156
1157 for (int y = 0; y < height; ++y) {
1158 const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
1159 const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
1160 const __m128i alternate_weights =
1161 _mm_unpacklo_epi16(weight_y, inverted_weight_y);
1162 // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
1163 // The madd instruction yields four results of the form:
1164 // (top_row[x] * weight[y] + corner * inverted_weight[y])
1165 __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
1166 sum = _mm_add_epi32(sum, pred_round);
1167 sum = _mm_srai_epi32(sum, 8);
1168 sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
1169 Store4(dst, sum);
1170 dst += stride;
1171 y_select = _mm_add_epi16(y_select, mask_increment);
1172 }
1173 }
1174
aom_smooth_v_predictor_4x4_ssse3(uint8_t * LIBAOM_RESTRICT dst,ptrdiff_t stride,const uint8_t * LIBAOM_RESTRICT top_row,const uint8_t * LIBAOM_RESTRICT left_column)1175 void aom_smooth_v_predictor_4x4_ssse3(
1176 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1177 const uint8_t *LIBAOM_RESTRICT top_row,
1178 const uint8_t *LIBAOM_RESTRICT left_column) {
1179 __m128i pixels;
1180 load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
1181
1182 __m128i weights[2];
1183 load_smooth_vertical_weights4(smooth_weights, 4, weights);
1184
1185 write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
1186 }
1187
aom_smooth_v_predictor_4x8_ssse3(uint8_t * LIBAOM_RESTRICT dst,ptrdiff_t stride,const uint8_t * LIBAOM_RESTRICT top_row,const uint8_t * LIBAOM_RESTRICT left_column)1188 void aom_smooth_v_predictor_4x8_ssse3(
1189 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1190 const uint8_t *LIBAOM_RESTRICT top_row,
1191 const uint8_t *LIBAOM_RESTRICT left_column) {
1192 __m128i pixels;
1193 load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
1194
1195 __m128i weights[2];
1196 load_smooth_vertical_weights4(smooth_weights, 8, weights);
1197
1198 write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1199 }
1200
aom_smooth_v_predictor_4x16_ssse3(uint8_t * LIBAOM_RESTRICT dst,ptrdiff_t stride,const uint8_t * LIBAOM_RESTRICT top_row,const uint8_t * LIBAOM_RESTRICT left_column)1201 void aom_smooth_v_predictor_4x16_ssse3(
1202 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1203 const uint8_t *LIBAOM_RESTRICT top_row,
1204 const uint8_t *LIBAOM_RESTRICT left_column) {
1205 __m128i pixels;
1206 load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
1207
1208 __m128i weights[4];
1209 load_smooth_vertical_weights4(smooth_weights, 16, weights);
1210
1211 write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1212 dst += stride << 3;
1213 write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
1214 }
1215
aom_smooth_v_predictor_8x4_ssse3(uint8_t * LIBAOM_RESTRICT dst,ptrdiff_t stride,const uint8_t * LIBAOM_RESTRICT top_row,const uint8_t * LIBAOM_RESTRICT left_column)1216 void aom_smooth_v_predictor_8x4_ssse3(
1217 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1218 const uint8_t *LIBAOM_RESTRICT top_row,
1219 const uint8_t *LIBAOM_RESTRICT left_column) {
1220 const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1221 const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1222 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1223 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1224 const __m128i scaled_bottom_left =
1225 _mm_mullo_epi16(inverted_weights, bottom_left);
1226 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1227 __m128i y_select = _mm_set1_epi32(0x01000100);
1228 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1229 __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1230 __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1231 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1232 &round);
1233 dst += stride;
1234 y_select = _mm_set1_epi32(0x03020302);
1235 weights_y = _mm_shuffle_epi8(weights, y_select);
1236 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1237 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1238 &round);
1239 dst += stride;
1240 y_select = _mm_set1_epi32(0x05040504);
1241 weights_y = _mm_shuffle_epi8(weights, y_select);
1242 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1243 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1244 &round);
1245 dst += stride;
1246 y_select = _mm_set1_epi32(0x07060706);
1247 weights_y = _mm_shuffle_epi8(weights, y_select);
1248 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1249 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1250 &round);
1251 }
1252
aom_smooth_v_predictor_8x8_ssse3(uint8_t * LIBAOM_RESTRICT dst,ptrdiff_t stride,const uint8_t * LIBAOM_RESTRICT top_row,const uint8_t * LIBAOM_RESTRICT left_column)1253 void aom_smooth_v_predictor_8x8_ssse3(
1254 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1255 const uint8_t *LIBAOM_RESTRICT top_row,
1256 const uint8_t *LIBAOM_RESTRICT left_column) {
1257 const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1258 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1259 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1260 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1261 const __m128i scaled_bottom_left =
1262 _mm_mullo_epi16(inverted_weights, bottom_left);
1263 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1264 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1265 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1266 const __m128i y_select = _mm_set1_epi32(y_mask);
1267 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1268 const __m128i scaled_bottom_left_y =
1269 _mm_shuffle_epi8(scaled_bottom_left, y_select);
1270 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1271 &round);
1272 dst += stride;
1273 }
1274 }
1275
1276 void aom_smooth_v_predictor_8x16_ssse3(
1277 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1278 const uint8_t *LIBAOM_RESTRICT top_row,
1279 const uint8_t *LIBAOM_RESTRICT left_column) {
1280 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1281 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1282
1283 const __m128i weights1 = cvtepu8_epi16(weights);
1284 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
1285 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1286 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1287 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1288 const __m128i scaled_bottom_left1 =
1289 _mm_mullo_epi16(inverted_weights1, bottom_left);
1290 const __m128i scaled_bottom_left2 =
1291 _mm_mullo_epi16(inverted_weights2, bottom_left);
1292 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1293 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1294 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1295 const __m128i y_select = _mm_set1_epi32(y_mask);
1296 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1297 const __m128i scaled_bottom_left_y =
1298 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1299 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1300 &round);
1301 dst += stride;
1302 }
1303 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1304 const __m128i y_select = _mm_set1_epi32(y_mask);
1305 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1306 const __m128i scaled_bottom_left_y =
1307 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1308 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1309 &round);
1310 dst += stride;
1311 }
1312 }
1313
1314 void aom_smooth_v_predictor_8x32_ssse3(
1315 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1316 const uint8_t *LIBAOM_RESTRICT top_row,
1317 const uint8_t *LIBAOM_RESTRICT left_column) {
1318 const __m128i zero = _mm_setzero_si128();
1319 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1320 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1321 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1322 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1323 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1324 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1325 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1326 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1327 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1328 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1329 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1330 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1331 const __m128i scaled_bottom_left1 =
1332 _mm_mullo_epi16(inverted_weights1, bottom_left);
1333 const __m128i scaled_bottom_left2 =
1334 _mm_mullo_epi16(inverted_weights2, bottom_left);
1335 const __m128i scaled_bottom_left3 =
1336 _mm_mullo_epi16(inverted_weights3, bottom_left);
1337 const __m128i scaled_bottom_left4 =
1338 _mm_mullo_epi16(inverted_weights4, bottom_left);
1339 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1340 const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1341 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1342 const __m128i y_select = _mm_set1_epi32(y_mask);
1343 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1344 const __m128i scaled_bottom_left_y =
1345 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1346 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1347 &round);
1348 dst += stride;
1349 }
1350 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1351 const __m128i y_select = _mm_set1_epi32(y_mask);
1352 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1353 const __m128i scaled_bottom_left_y =
1354 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1355 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1356 &round);
1357 dst += stride;
1358 }
1359 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1360 const __m128i y_select = _mm_set1_epi32(y_mask);
1361 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1362 const __m128i scaled_bottom_left_y =
1363 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1364 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1365 &round);
1366 dst += stride;
1367 }
1368 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1369 const __m128i y_select = _mm_set1_epi32(y_mask);
1370 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1371 const __m128i scaled_bottom_left_y =
1372 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1373 write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1374 &round);
1375 dst += stride;
1376 }
1377 }
1378
1379 void aom_smooth_v_predictor_16x4_ssse3(
1380 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1381 const uint8_t *LIBAOM_RESTRICT top_row,
1382 const uint8_t *LIBAOM_RESTRICT left_column) {
1383 const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1384 const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1385 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1386 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1387 const __m128i scaled_bottom_left =
1388 _mm_mullo_epi16(inverted_weights, bottom_left);
1389   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1390 const __m128i top = LoadUnaligned16(top_row);
1391 const __m128i top_lo = cvtepu8_epi16(top);
1392 const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1393
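  // Height-4 blocks need only four of the eight rows the y_mask loop would
  // produce, so the rows are unrolled with explicit per-row shuffle masks.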
1394 __m128i y_select = _mm_set1_epi32(0x01000100);
1395 __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1396 __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1397 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1398 scaled_bottom_left_y, scaled_bottom_left_y,
1399 round);
1400 dst += stride;
1401 y_select = _mm_set1_epi32(0x03020302);
1402 weights_y = _mm_shuffle_epi8(weights, y_select);
1403 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1404 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1405 scaled_bottom_left_y, scaled_bottom_left_y,
1406 round);
1407 dst += stride;
1408 y_select = _mm_set1_epi32(0x05040504);
1409 weights_y = _mm_shuffle_epi8(weights, y_select);
1410 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1411 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1412 scaled_bottom_left_y, scaled_bottom_left_y,
1413 round);
1414 dst += stride;
1415 y_select = _mm_set1_epi32(0x07060706);
1416 weights_y = _mm_shuffle_epi8(weights, y_select);
1417 scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1418 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1419 scaled_bottom_left_y, scaled_bottom_left_y,
1420 round);
1421 }
1422
1423 void aom_smooth_v_predictor_16x8_ssse3(
1424 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1425 const uint8_t *LIBAOM_RESTRICT top_row,
1426 const uint8_t *LIBAOM_RESTRICT left_column) {
1427 const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1428 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1429 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1430 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1431 const __m128i scaled_bottom_left =
1432 _mm_mullo_epi16(inverted_weights, bottom_left);
1433   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1434 const __m128i top = LoadUnaligned16(top_row);
1435 const __m128i top_lo = cvtepu8_epi16(top);
1436 const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1437 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1438 const __m128i y_select = _mm_set1_epi32(y_mask);
1439 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1440 const __m128i scaled_bottom_left_y =
1441 _mm_shuffle_epi8(scaled_bottom_left, y_select);
1442 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1443 scaled_bottom_left_y, scaled_bottom_left_y,
1444 round);
1445 dst += stride;
1446 }
1447 }
1448
1449 void aom_smooth_v_predictor_16x16_ssse3(
1450 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1451 const uint8_t *LIBAOM_RESTRICT top_row,
1452 const uint8_t *LIBAOM_RESTRICT left_column) {
1453 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1454 const __m128i zero = _mm_setzero_si128();
1455 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1456 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1457 const __m128i weights_lo = cvtepu8_epi16(weights);
1458 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1459 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1460 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1461 const __m128i scaled_bottom_left_lo =
1462 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1463 const __m128i scaled_bottom_left_hi =
1464 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1465   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1466
1467 const __m128i top = LoadUnaligned16(top_row);
1468 const __m128i top_lo = cvtepu8_epi16(top);
1469 const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1470 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1471 const __m128i y_select = _mm_set1_epi32(y_mask);
1472 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1473 const __m128i scaled_bottom_left_y =
1474 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1475 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1476 scaled_bottom_left_y, scaled_bottom_left_y,
1477 round);
1478 dst += stride;
1479 }
1480 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1481 const __m128i y_select = _mm_set1_epi32(y_mask);
1482 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1483 const __m128i scaled_bottom_left_y =
1484 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1485 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1486 scaled_bottom_left_y, scaled_bottom_left_y,
1487 round);
1488 dst += stride;
1489 }
1490 }
1491
1492 void aom_smooth_v_predictor_16x32_ssse3(
1493 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1494 const uint8_t *LIBAOM_RESTRICT top_row,
1495 const uint8_t *LIBAOM_RESTRICT left_column) {
1496 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1497 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1498 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1499 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1500 const __m128i zero = _mm_setzero_si128();
1501 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1502 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1503 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1504 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1505 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1506 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1507 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1508 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1509 const __m128i scaled_bottom_left1 =
1510 _mm_mullo_epi16(inverted_weights1, bottom_left);
1511 const __m128i scaled_bottom_left2 =
1512 _mm_mullo_epi16(inverted_weights2, bottom_left);
1513 const __m128i scaled_bottom_left3 =
1514 _mm_mullo_epi16(inverted_weights3, bottom_left);
1515 const __m128i scaled_bottom_left4 =
1516 _mm_mullo_epi16(inverted_weights4, bottom_left);
1517   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1518
1519 const __m128i top = LoadUnaligned16(top_row);
1520 const __m128i top_lo = cvtepu8_epi16(top);
1521 const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1522 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1523 const __m128i y_select = _mm_set1_epi32(y_mask);
1524 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1525 const __m128i scaled_bottom_left_y =
1526 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1527 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1528 scaled_bottom_left_y, scaled_bottom_left_y,
1529 round);
1530 dst += stride;
1531 }
1532 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1533 const __m128i y_select = _mm_set1_epi32(y_mask);
1534 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1535 const __m128i scaled_bottom_left_y =
1536 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1537 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1538 scaled_bottom_left_y, scaled_bottom_left_y,
1539 round);
1540 dst += stride;
1541 }
1542 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1543 const __m128i y_select = _mm_set1_epi32(y_mask);
1544 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1545 const __m128i scaled_bottom_left_y =
1546 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1547 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1548 scaled_bottom_left_y, scaled_bottom_left_y,
1549 round);
1550 dst += stride;
1551 }
1552 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1553 const __m128i y_select = _mm_set1_epi32(y_mask);
1554 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1555 const __m128i scaled_bottom_left_y =
1556 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1557 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1558 scaled_bottom_left_y, scaled_bottom_left_y,
1559 round);
1560 dst += stride;
1561 }
1562 }
1563
1564 void aom_smooth_v_predictor_16x64_ssse3(
1565 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1566 const uint8_t *LIBAOM_RESTRICT top_row,
1567 const uint8_t *LIBAOM_RESTRICT left_column) {
1568 const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1569 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1570   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1571 const __m128i zero = _mm_setzero_si128();
1572 const __m128i top = LoadUnaligned16(top_row);
1573 const __m128i top_lo = cvtepu8_epi16(top);
1574 const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
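  // smooth_weights stores the weight vectors for block dimensions 4, 8, 16,
  // 32 and 64 back to back, so the 64-entry table starts at 4+8+16+32 = 60.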
1575 const uint8_t *weights_base_ptr = smooth_weights + 60;
1576 for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1577 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1578 const __m128i weights_lo = cvtepu8_epi16(weights);
1579 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1580 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1581 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1582 const __m128i scaled_bottom_left_lo =
1583 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1584 const __m128i scaled_bottom_left_hi =
1585 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1586
1587 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1588 const __m128i y_select = _mm_set1_epi32(y_mask);
1589 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1590 const __m128i scaled_bottom_left_y =
1591 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1592 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1593 scaled_bottom_left_y, scaled_bottom_left_y,
1594 round);
1595 dst += stride;
1596 }
1597 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1598 const __m128i y_select = _mm_set1_epi32(y_mask);
1599 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1600 const __m128i scaled_bottom_left_y =
1601 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1602 write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1603 scaled_bottom_left_y, scaled_bottom_left_y,
1604 round);
1605 dst += stride;
1606 }
1607 }
1608 }
1609
1610 void aom_smooth_v_predictor_32x8_ssse3(
1611 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1612 const uint8_t *LIBAOM_RESTRICT top_row,
1613 const uint8_t *LIBAOM_RESTRICT left_column) {
1614 const __m128i zero = _mm_setzero_si128();
1615 const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1616 const __m128i top_lo = LoadUnaligned16(top_row);
1617 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1618 const __m128i top1 = cvtepu8_epi16(top_lo);
1619 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1620 const __m128i top3 = cvtepu8_epi16(top_hi);
1621 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1622   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1623 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1624 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1625 const __m128i scaled_bottom_left =
1626 _mm_mullo_epi16(inverted_weights, bottom_left);
1627 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1628 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1629 __m128i y_select = _mm_set1_epi32(y_mask);
1630 const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1631 const __m128i scaled_bottom_left_y =
1632 _mm_shuffle_epi8(scaled_bottom_left, y_select);
1633 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1634 scaled_bottom_left_y, scaled_bottom_left_y,
1635 round);
1636 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1637 scaled_bottom_left_y, scaled_bottom_left_y,
1638 round);
1639 dst += stride;
1640 }
1641 }
1642
1643 void aom_smooth_v_predictor_32x16_ssse3(
1644 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1645 const uint8_t *LIBAOM_RESTRICT top_row,
1646 const uint8_t *LIBAOM_RESTRICT left_column) {
1647 const __m128i zero = _mm_setzero_si128();
1648 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1649 const __m128i top_lo = LoadUnaligned16(top_row);
1650 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1651 const __m128i top1 = cvtepu8_epi16(top_lo);
1652 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1653 const __m128i top3 = cvtepu8_epi16(top_hi);
1654 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1655 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1656 const __m128i weights1 = cvtepu8_epi16(weights);
1657 const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1658 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1659 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1660 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1661 const __m128i scaled_bottom_left1 =
1662 _mm_mullo_epi16(inverted_weights1, bottom_left);
1663 const __m128i scaled_bottom_left2 =
1664 _mm_mullo_epi16(inverted_weights2, bottom_left);
1665 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1666 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1667 __m128i y_select = _mm_set1_epi32(y_mask);
1668 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1669 const __m128i scaled_bottom_left_y =
1670 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1671 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1672 scaled_bottom_left_y, scaled_bottom_left_y,
1673 round);
1674 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1675 scaled_bottom_left_y, scaled_bottom_left_y,
1676 round);
1677 dst += stride;
1678 }
1679 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1680 __m128i y_select = _mm_set1_epi32(y_mask);
1681 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1682 const __m128i scaled_bottom_left_y =
1683 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1684 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1685 scaled_bottom_left_y, scaled_bottom_left_y,
1686 round);
1687 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1688 scaled_bottom_left_y, scaled_bottom_left_y,
1689 round);
1690 dst += stride;
1691 }
1692 }
1693
1694 void aom_smooth_v_predictor_32x32_ssse3(
1695 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1696 const uint8_t *LIBAOM_RESTRICT top_row,
1697 const uint8_t *LIBAOM_RESTRICT left_column) {
1698 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1699 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1700 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1701 const __m128i zero = _mm_setzero_si128();
1702 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1703 const __m128i top_lo = LoadUnaligned16(top_row);
1704 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1705 const __m128i top1 = cvtepu8_epi16(top_lo);
1706 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1707 const __m128i top3 = cvtepu8_epi16(top_hi);
1708 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1709 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1710 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1711 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1712 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1713 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1714 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1715 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1716 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1717 const __m128i scaled_bottom_left1 =
1718 _mm_mullo_epi16(inverted_weights1, bottom_left);
1719 const __m128i scaled_bottom_left2 =
1720 _mm_mullo_epi16(inverted_weights2, bottom_left);
1721 const __m128i scaled_bottom_left3 =
1722 _mm_mullo_epi16(inverted_weights3, bottom_left);
1723 const __m128i scaled_bottom_left4 =
1724 _mm_mullo_epi16(inverted_weights4, bottom_left);
1725 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1726 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1727 const __m128i y_select = _mm_set1_epi32(y_mask);
1728 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1729 const __m128i scaled_bottom_left_y =
1730 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1731 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1732 scaled_bottom_left_y, scaled_bottom_left_y,
1733 round);
1734 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1735 scaled_bottom_left_y, scaled_bottom_left_y,
1736 round);
1737 dst += stride;
1738 }
1739 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1740 const __m128i y_select = _mm_set1_epi32(y_mask);
1741 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1742 const __m128i scaled_bottom_left_y =
1743 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1744 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1745 scaled_bottom_left_y, scaled_bottom_left_y,
1746 round);
1747 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1748 scaled_bottom_left_y, scaled_bottom_left_y,
1749 round);
1750 dst += stride;
1751 }
1752 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1753 const __m128i y_select = _mm_set1_epi32(y_mask);
1754 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1755 const __m128i scaled_bottom_left_y =
1756 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1757 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1758 scaled_bottom_left_y, scaled_bottom_left_y,
1759 round);
1760 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1761 scaled_bottom_left_y, scaled_bottom_left_y,
1762 round);
1763 dst += stride;
1764 }
1765 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1766 const __m128i y_select = _mm_set1_epi32(y_mask);
1767 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1768 const __m128i scaled_bottom_left_y =
1769 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1770 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1771 scaled_bottom_left_y, scaled_bottom_left_y,
1772 round);
1773 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1774 scaled_bottom_left_y, scaled_bottom_left_y,
1775 round);
1776 dst += stride;
1777 }
1778 }
1779
1780 void aom_smooth_v_predictor_32x64_ssse3(
1781 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1782 const uint8_t *LIBAOM_RESTRICT top_row,
1783 const uint8_t *LIBAOM_RESTRICT left_column) {
1784 const __m128i zero = _mm_setzero_si128();
1785 const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1786 const __m128i top_lo = LoadUnaligned16(top_row);
1787 const __m128i top_hi = LoadUnaligned16(top_row + 16);
1788 const __m128i top1 = cvtepu8_epi16(top_lo);
1789 const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1790 const __m128i top3 = cvtepu8_epi16(top_hi);
1791 const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1792 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1793 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1794 const uint8_t *weights_base_ptr = smooth_weights + 60;
1795 for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1796 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1797 const __m128i weights_lo = cvtepu8_epi16(weights);
1798 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1799 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1800 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1801 const __m128i scaled_bottom_left_lo =
1802 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1803 const __m128i scaled_bottom_left_hi =
1804 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1805
1806 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1807 const __m128i y_select = _mm_set1_epi32(y_mask);
1808 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1809 const __m128i scaled_bottom_left_y =
1810 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1811 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1812 scaled_bottom_left_y, scaled_bottom_left_y,
1813 round);
1814 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1815 scaled_bottom_left_y, scaled_bottom_left_y,
1816 round);
1817 dst += stride;
1818 }
1819 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1820 const __m128i y_select = _mm_set1_epi32(y_mask);
1821 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1822 const __m128i scaled_bottom_left_y =
1823 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1824 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1825 scaled_bottom_left_y, scaled_bottom_left_y,
1826 round);
1827 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1828 scaled_bottom_left_y, scaled_bottom_left_y,
1829 round);
1830 dst += stride;
1831 }
1832 }
1833 }
1834
1835 void aom_smooth_v_predictor_64x16_ssse3(
1836 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1837 const uint8_t *LIBAOM_RESTRICT top_row,
1838 const uint8_t *LIBAOM_RESTRICT left_column) {
1839 const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1840 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1841 const __m128i zero = _mm_setzero_si128();
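  // Widen the 64-pixel top row into eight vectors of eight 16-bit lanes
  // (top1..top8); each output row is then written as four 16-pixel stores.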
1842 const __m128i top_lolo = LoadUnaligned16(top_row);
1843 const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1844 const __m128i top1 = cvtepu8_epi16(top_lolo);
1845 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1846 const __m128i top3 = cvtepu8_epi16(top_lohi);
1847 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1848
1849 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1850 const __m128i weights1 = cvtepu8_epi16(weights);
1851 const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1852 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1853 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1854 const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1855 const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1856 const __m128i top5 = cvtepu8_epi16(top_hilo);
1857 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1858 const __m128i top7 = cvtepu8_epi16(top_hihi);
1859 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1860 const __m128i scaled_bottom_left1 =
1861 _mm_mullo_epi16(inverted_weights1, bottom_left);
1862 const __m128i scaled_bottom_left2 =
1863 _mm_mullo_epi16(inverted_weights2, bottom_left);
1864 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1865 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1866 const __m128i y_select = _mm_set1_epi32(y_mask);
1867 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1868 const __m128i scaled_bottom_left_y =
1869 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1870 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1871 scaled_bottom_left_y, scaled_bottom_left_y,
1872 round);
1873 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1874 scaled_bottom_left_y, scaled_bottom_left_y,
1875 round);
1876 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1877 scaled_bottom_left_y, scaled_bottom_left_y,
1878 round);
1879 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1880 scaled_bottom_left_y, scaled_bottom_left_y,
1881 round);
1882 dst += stride;
1883 }
1884 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1885 const __m128i y_select = _mm_set1_epi32(y_mask);
1886 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1887 const __m128i scaled_bottom_left_y =
1888 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1889 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1890 scaled_bottom_left_y, scaled_bottom_left_y,
1891 round);
1892 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1893 scaled_bottom_left_y, scaled_bottom_left_y,
1894 round);
1895 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1896 scaled_bottom_left_y, scaled_bottom_left_y,
1897 round);
1898 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1899 scaled_bottom_left_y, scaled_bottom_left_y,
1900 round);
1901 dst += stride;
1902 }
1903 }
1904
1905 void aom_smooth_v_predictor_64x32_ssse3(
1906 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1907 const uint8_t *LIBAOM_RESTRICT top_row,
1908 const uint8_t *LIBAOM_RESTRICT left_column) {
1909 const __m128i zero = _mm_setzero_si128();
1910 const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1911 const __m128i top_lolo = LoadUnaligned16(top_row);
1912 const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1913 const __m128i top1 = cvtepu8_epi16(top_lolo);
1914 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1915 const __m128i top3 = cvtepu8_epi16(top_lohi);
1916 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1917 const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1918 const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1919 const __m128i top5 = cvtepu8_epi16(top_hilo);
1920 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1921 const __m128i top7 = cvtepu8_epi16(top_hihi);
1922 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1923 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1924 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1925 const __m128i weights1 = cvtepu8_epi16(weights_lo);
1926 const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1927 const __m128i weights3 = cvtepu8_epi16(weights_hi);
1928 const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1929 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1930 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1931 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1932 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1933 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1934 const __m128i scaled_bottom_left1 =
1935 _mm_mullo_epi16(inverted_weights1, bottom_left);
1936 const __m128i scaled_bottom_left2 =
1937 _mm_mullo_epi16(inverted_weights2, bottom_left);
1938 const __m128i scaled_bottom_left3 =
1939 _mm_mullo_epi16(inverted_weights3, bottom_left);
1940 const __m128i scaled_bottom_left4 =
1941 _mm_mullo_epi16(inverted_weights4, bottom_left);
1942 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1943
1944 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1945 const __m128i y_select = _mm_set1_epi32(y_mask);
1946 const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1947 const __m128i scaled_bottom_left_y =
1948 _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1949 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1950 scaled_bottom_left_y, scaled_bottom_left_y,
1951 round);
1952 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1953 scaled_bottom_left_y, scaled_bottom_left_y,
1954 round);
1955 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1956 scaled_bottom_left_y, scaled_bottom_left_y,
1957 round);
1958 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1959 scaled_bottom_left_y, scaled_bottom_left_y,
1960 round);
1961 dst += stride;
1962 }
1963 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1964 const __m128i y_select = _mm_set1_epi32(y_mask);
1965 const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1966 const __m128i scaled_bottom_left_y =
1967 _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1968 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1969 scaled_bottom_left_y, scaled_bottom_left_y,
1970 round);
1971 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1972 scaled_bottom_left_y, scaled_bottom_left_y,
1973 round);
1974 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1975 scaled_bottom_left_y, scaled_bottom_left_y,
1976 round);
1977 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1978 scaled_bottom_left_y, scaled_bottom_left_y,
1979 round);
1980 dst += stride;
1981 }
1982 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1983 const __m128i y_select = _mm_set1_epi32(y_mask);
1984 const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1985 const __m128i scaled_bottom_left_y =
1986 _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1987 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1988 scaled_bottom_left_y, scaled_bottom_left_y,
1989 round);
1990 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1991 scaled_bottom_left_y, scaled_bottom_left_y,
1992 round);
1993 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1994 scaled_bottom_left_y, scaled_bottom_left_y,
1995 round);
1996 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1997 scaled_bottom_left_y, scaled_bottom_left_y,
1998 round);
1999 dst += stride;
2000 }
2001 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2002 const __m128i y_select = _mm_set1_epi32(y_mask);
2003 const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
2004 const __m128i scaled_bottom_left_y =
2005 _mm_shuffle_epi8(scaled_bottom_left4, y_select);
2006 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2007 scaled_bottom_left_y, scaled_bottom_left_y,
2008 round);
2009 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2010 scaled_bottom_left_y, scaled_bottom_left_y,
2011 round);
2012 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2013 scaled_bottom_left_y, scaled_bottom_left_y,
2014 round);
2015 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2016 scaled_bottom_left_y, scaled_bottom_left_y,
2017 round);
2018 dst += stride;
2019 }
2020 }
2021
2022 void aom_smooth_v_predictor_64x64_ssse3(
2023 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2024 const uint8_t *LIBAOM_RESTRICT top_row,
2025 const uint8_t *LIBAOM_RESTRICT left_column) {
2026 const __m128i zero = _mm_setzero_si128();
2027 const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
2028 const __m128i top_lolo = LoadUnaligned16(top_row);
2029 const __m128i top_lohi = LoadUnaligned16(top_row + 16);
2030 const __m128i top1 = cvtepu8_epi16(top_lolo);
2031 const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
2032 const __m128i top3 = cvtepu8_epi16(top_lohi);
2033 const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
2034 const __m128i top_hilo = LoadUnaligned16(top_row + 32);
2035 const __m128i top_hihi = LoadUnaligned16(top_row + 48);
2036 const __m128i top5 = cvtepu8_epi16(top_hilo);
2037 const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
2038 const __m128i top7 = cvtepu8_epi16(top_hihi);
2039 const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
2040 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2041   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2042 const uint8_t *weights_base_ptr = smooth_weights + 60;
2043 for (int left_offset = 0; left_offset < 64; left_offset += 16) {
2044 const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
2045 const __m128i weights_lo = cvtepu8_epi16(weights);
2046 const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
2047 const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
2048 const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
2049 const __m128i scaled_bottom_left_lo =
2050 _mm_mullo_epi16(inverted_weights_lo, bottom_left);
2051 const __m128i scaled_bottom_left_hi =
2052 _mm_mullo_epi16(inverted_weights_hi, bottom_left);
2053 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2054 const __m128i y_select = _mm_set1_epi32(y_mask);
2055 const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
2056 const __m128i scaled_bottom_left_y =
2057 _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
2058 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2059 scaled_bottom_left_y, scaled_bottom_left_y,
2060 round);
2061 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2062 scaled_bottom_left_y, scaled_bottom_left_y,
2063 round);
2064 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2065 scaled_bottom_left_y, scaled_bottom_left_y,
2066 round);
2067 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2068 scaled_bottom_left_y, scaled_bottom_left_y,
2069 round);
2070 dst += stride;
2071 }
2072 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2073 const __m128i y_select = _mm_set1_epi32(y_mask);
2074 const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
2075 const __m128i scaled_bottom_left_y =
2076 _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
2077 write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2078 scaled_bottom_left_y, scaled_bottom_left_y,
2079 round);
2080 write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2081 scaled_bottom_left_y, scaled_bottom_left_y,
2082 round);
2083 write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2084 scaled_bottom_left_y, scaled_bottom_left_y,
2085 round);
2086 write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2087 scaled_bottom_left_y, scaled_bottom_left_y,
2088 round);
2089 dst += stride;
2090 }
2091 }
2092 }
2093
2094 // -----------------------------------------------------------------------------
2095 // SMOOTH_H_PRED
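// SMOOTH_H mirrors SMOOTH_V with the edge roles swapped:
//   pred[y][x] = (weights[x] * left[y] +
//                 ((1 << SMOOTH_WEIGHT_LOG2_SCALE) - weights[x]) * top_right
//                 + round) >> SMOOTH_WEIGHT_LOG2_SCALE,
// where top_right = top_row[width - 1]. The weighted top_right term is
// constant for the whole block; only left[y] changes from row to row.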
2096 static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
2097 uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
2098 const __m128i *scaled_top_right, const __m128i *round) {
2099 const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
2100 const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
2101 // Equivalent to RightShiftWithRounding(pred[x][y], 8).
2102 const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
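  // 0x0C080400 selects byte 0 of each 32-bit lane (bytes 0, 4, 8 and 12)
  // into the low dword, i.e. a truncating 32-bit to 8-bit pack of the four
  // predicted pixels.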
2103 const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
2104 Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
2105 }
2106
2107 void aom_smooth_h_predictor_4x4_ssse3(
2108 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2109 const uint8_t *LIBAOM_RESTRICT top_row,
2110 const uint8_t *LIBAOM_RESTRICT left_column) {
2111 const __m128i top_right = _mm_set1_epi32(top_row[3]);
2112 const __m128i left = cvtepu8_epi32(Load4(left_column));
2113 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2114 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2115 const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2116 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2117 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2118 __m128i left_y = _mm_shuffle_epi32(left, 0);
2119 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2120 &round);
2121 dst += stride;
2122 left_y = _mm_shuffle_epi32(left, 0x55);
2123 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2124 &round);
2125 dst += stride;
2126 left_y = _mm_shuffle_epi32(left, 0xaa);
2127 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2128 &round);
2129 dst += stride;
2130 left_y = _mm_shuffle_epi32(left, 0xff);
2131 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2132 &round);
2133 }
2134
2135 void aom_smooth_h_predictor_4x8_ssse3(
2136 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2137 const uint8_t *LIBAOM_RESTRICT top_row,
2138 const uint8_t *LIBAOM_RESTRICT left_column) {
2139 const __m128i top_right = _mm_set1_epi32(top_row[3]);
2140 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2141 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2142 const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2143 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2144 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2145 __m128i left = cvtepu8_epi32(Load4(left_column));
2146 __m128i left_y = _mm_shuffle_epi32(left, 0);
2147 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2148 &round);
2149 dst += stride;
2150 left_y = _mm_shuffle_epi32(left, 0x55);
2151 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2152 &round);
2153 dst += stride;
2154 left_y = _mm_shuffle_epi32(left, 0xaa);
2155 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2156 &round);
2157 dst += stride;
2158 left_y = _mm_shuffle_epi32(left, 0xff);
2159 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2160 &round);
2161 dst += stride;
2162
2163 left = cvtepu8_epi32(Load4(left_column + 4));
2164 left_y = _mm_shuffle_epi32(left, 0);
2165 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2166 &round);
2167 dst += stride;
2168 left_y = _mm_shuffle_epi32(left, 0x55);
2169 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2170 &round);
2171 dst += stride;
2172 left_y = _mm_shuffle_epi32(left, 0xaa);
2173 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2174 &round);
2175 dst += stride;
2176 left_y = _mm_shuffle_epi32(left, 0xff);
2177 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2178 &round);
2179 }
2180
2181 void aom_smooth_h_predictor_4x16_ssse3(
2182 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2183 const uint8_t *LIBAOM_RESTRICT top_row,
2184 const uint8_t *LIBAOM_RESTRICT left_column) {
2185 const __m128i top_right = _mm_set1_epi32(top_row[3]);
2186 const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2187 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2188 const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2189 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2190 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2191 __m128i left = cvtepu8_epi32(Load4(left_column));
2192 __m128i left_y = _mm_shuffle_epi32(left, 0);
2193 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2194 &round);
2195 dst += stride;
2196 left_y = _mm_shuffle_epi32(left, 0x55);
2197 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2198 &round);
2199 dst += stride;
2200 left_y = _mm_shuffle_epi32(left, 0xaa);
2201 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2202 &round);
2203 dst += stride;
2204 left_y = _mm_shuffle_epi32(left, 0xff);
2205 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2206 &round);
2207 dst += stride;
2208
2209 left = cvtepu8_epi32(Load4(left_column + 4));
2210 left_y = _mm_shuffle_epi32(left, 0);
2211 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2212 &round);
2213 dst += stride;
2214 left_y = _mm_shuffle_epi32(left, 0x55);
2215 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2216 &round);
2217 dst += stride;
2218 left_y = _mm_shuffle_epi32(left, 0xaa);
2219 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2220 &round);
2221 dst += stride;
2222 left_y = _mm_shuffle_epi32(left, 0xff);
2223 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2224 &round);
2225 dst += stride;
2226
2227 left = cvtepu8_epi32(Load4(left_column + 8));
2228 left_y = _mm_shuffle_epi32(left, 0);
2229 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2230 &round);
2231 dst += stride;
2232 left_y = _mm_shuffle_epi32(left, 0x55);
2233 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2234 &round);
2235 dst += stride;
2236 left_y = _mm_shuffle_epi32(left, 0xaa);
2237 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2238 &round);
2239 dst += stride;
2240 left_y = _mm_shuffle_epi32(left, 0xff);
2241 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2242 &round);
2243 dst += stride;
2244
2245 left = cvtepu8_epi32(Load4(left_column + 12));
2246 left_y = _mm_shuffle_epi32(left, 0);
2247 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2248 &round);
2249 dst += stride;
2250 left_y = _mm_shuffle_epi32(left, 0x55);
2251 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2252 &round);
2253 dst += stride;
2254 left_y = _mm_shuffle_epi32(left, 0xaa);
2255 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2256 &round);
2257 dst += stride;
2258 left_y = _mm_shuffle_epi32(left, 0xff);
2259 write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2260 &round);
2261 }
2262
2263 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
2264 // |pixels| is a segment of the top row or the whole top row, and |weights| is
2265 // repeated.
2266 void aom_smooth_h_predictor_8x4_ssse3(
2267 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2268 const uint8_t *LIBAOM_RESTRICT top_row,
2269 const uint8_t *LIBAOM_RESTRICT left_column) {
2270 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2271 const __m128i left = cvtepu8_epi16(Load4(left_column));
2272 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2273 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2274 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2275 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2276 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2277 __m128i y_select = _mm_set1_epi32(0x01000100);
2278 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2279 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2280 &round);
2281 dst += stride;
2282 y_select = _mm_set1_epi32(0x03020302);
2283 left_y = _mm_shuffle_epi8(left, y_select);
2284 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2285 &round);
2286 dst += stride;
2287 y_select = _mm_set1_epi32(0x05040504);
2288 left_y = _mm_shuffle_epi8(left, y_select);
2289 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2290 &round);
2291 dst += stride;
2292 y_select = _mm_set1_epi32(0x07060706);
2293 left_y = _mm_shuffle_epi8(left, y_select);
2294 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2295 &round);
2296 }
2297
2298 void aom_smooth_h_predictor_8x8_ssse3(
2299 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2300 const uint8_t *LIBAOM_RESTRICT top_row,
2301 const uint8_t *LIBAOM_RESTRICT left_column) {
2302 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2303 const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2304 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2305 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2306 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2307 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2308 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
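  // For SMOOTH_H the pshufb masks replicate left_column[y] (already widened
  // to 16 bits) across the row; the weight vector varies along x instead.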
2309 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2310 const __m128i y_select = _mm_set1_epi32(y_mask);
2311 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2312 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2313 &round);
2314 dst += stride;
2315 }
2316 }
2317
2318 void aom_smooth_h_predictor_8x16_ssse3(
2319 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2320 const uint8_t *LIBAOM_RESTRICT top_row,
2321 const uint8_t *LIBAOM_RESTRICT left_column) {
2322 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2323 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2324 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2325 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2326 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2327 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2328 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2329 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2330 const __m128i y_select = _mm_set1_epi32(y_mask);
2331 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2332 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2333 &round);
2334 dst += stride;
2335 }
2336 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2337 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2338 const __m128i y_select = _mm_set1_epi32(y_mask);
2339 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2340 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2341 &round);
2342 dst += stride;
2343 }
2344 }
2345
2346 void aom_smooth_h_predictor_8x32_ssse3(
2347 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2348 const uint8_t *LIBAOM_RESTRICT top_row,
2349 const uint8_t *LIBAOM_RESTRICT left_column) {
2350 const __m128i top_right = _mm_set1_epi16(top_row[7]);
2351 const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2352 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2353 const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2354 const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2355 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2356 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2357 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2358 const __m128i y_select = _mm_set1_epi32(y_mask);
2359 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2360 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2361 &round);
2362 dst += stride;
2363 }
2364 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2365 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2366 const __m128i y_select = _mm_set1_epi32(y_mask);
2367 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2368 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2369 &round);
2370 dst += stride;
2371 }
2372 left = cvtepu8_epi16(LoadLo8(left_column + 16));
2373 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2374 const __m128i y_select = _mm_set1_epi32(y_mask);
2375 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2376 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2377 &round);
2378 dst += stride;
2379 }
2380 left = cvtepu8_epi16(LoadLo8(left_column + 24));
2381 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2382 const __m128i y_select = _mm_set1_epi32(y_mask);
2383 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2384 write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2385 &round);
2386 dst += stride;
2387 }
2388 }
2389
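// 16-wide blocks split the weight row into two 8-lane halves (weights1 and
// weights2) so write_smooth_directional_sum16 can emit a full 16-pixel store
// per row. The 4-row variant below unrolls the rows with explicit shuffle
// masks instead of the y_mask loop.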
2390 void aom_smooth_h_predictor_16x4_ssse3(
2391 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2392 const uint8_t *LIBAOM_RESTRICT top_row,
2393 const uint8_t *LIBAOM_RESTRICT left_column) {
2394 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2395 const __m128i left = cvtepu8_epi16(Load4(left_column));
2396 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2397 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2398 const __m128i weights1 = cvtepu8_epi16(weights);
2399 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2400 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2401 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2402 const __m128i scaled_top_right1 =
2403 _mm_mullo_epi16(inverted_weights1, top_right);
2404 const __m128i scaled_top_right2 =
2405 _mm_mullo_epi16(inverted_weights2, top_right);
2406 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2407 __m128i y_mask = _mm_set1_epi32(0x01000100);
2408 __m128i left_y = _mm_shuffle_epi8(left, y_mask);
2409 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2410 scaled_top_right1, scaled_top_right2, round);
2411 dst += stride;
2412 y_mask = _mm_set1_epi32(0x03020302);
2413 left_y = _mm_shuffle_epi8(left, y_mask);
2414 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2415 scaled_top_right1, scaled_top_right2, round);
2416 dst += stride;
2417 y_mask = _mm_set1_epi32(0x05040504);
2418 left_y = _mm_shuffle_epi8(left, y_mask);
2419 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2420 scaled_top_right1, scaled_top_right2, round);
2421 dst += stride;
2422 y_mask = _mm_set1_epi32(0x07060706);
2423 left_y = _mm_shuffle_epi8(left, y_mask);
2424 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2425 scaled_top_right1, scaled_top_right2, round);
2426 }
2427
2428 void aom_smooth_h_predictor_16x8_ssse3(
2429 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2430 const uint8_t *LIBAOM_RESTRICT top_row,
2431 const uint8_t *LIBAOM_RESTRICT left_column) {
2432 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2433 const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2434 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2435 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2436 const __m128i weights1 = cvtepu8_epi16(weights);
2437 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2438 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2439 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2440 const __m128i scaled_top_right1 =
2441 _mm_mullo_epi16(inverted_weights1, top_right);
2442 const __m128i scaled_top_right2 =
2443 _mm_mullo_epi16(inverted_weights2, top_right);
2444 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2445 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2446 const __m128i y_select = _mm_set1_epi32(y_mask);
2447 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2448 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2449 scaled_top_right1, scaled_top_right2, round);
2450 dst += stride;
2451 }
2452 }
2453
2454 void aom_smooth_h_predictor_16x16_ssse3(
2455 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2456 const uint8_t *LIBAOM_RESTRICT top_row,
2457 const uint8_t *LIBAOM_RESTRICT left_column) {
2458 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2459 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2460 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2461 const __m128i weights1 = cvtepu8_epi16(weights);
2462 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2463 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2464 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2465 const __m128i scaled_top_right1 =
2466 _mm_mullo_epi16(inverted_weights1, top_right);
2467 const __m128i scaled_top_right2 =
2468 _mm_mullo_epi16(inverted_weights2, top_right);
2469 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2470 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2471 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2472 const __m128i y_select = _mm_set1_epi32(y_mask);
2473 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2474 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2475 scaled_top_right1, scaled_top_right2, round);
2476 dst += stride;
2477 }
2478 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2479 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2480 const __m128i y_select = _mm_set1_epi32(y_mask);
2481 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2482 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2483 scaled_top_right1, scaled_top_right2, round);
2484 dst += stride;
2485 }
2486 }
2487
2488 void aom_smooth_h_predictor_16x32_ssse3(
2489 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2490 const uint8_t *LIBAOM_RESTRICT top_row,
2491 const uint8_t *LIBAOM_RESTRICT left_column) {
2492 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2493 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2494 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2495 const __m128i weights1 = cvtepu8_epi16(weights);
2496 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2497 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2498 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2499 const __m128i scaled_top_right1 =
2500 _mm_mullo_epi16(inverted_weights1, top_right);
2501 const __m128i scaled_top_right2 =
2502 _mm_mullo_epi16(inverted_weights2, top_right);
2503 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2504 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2505 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2506 const __m128i y_select = _mm_set1_epi32(y_mask);
2507 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2508 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2509 scaled_top_right1, scaled_top_right2, round);
2510 dst += stride;
2511 }
2512 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2513 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2514 const __m128i y_select = _mm_set1_epi32(y_mask);
2515 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2516 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2517 scaled_top_right1, scaled_top_right2, round);
2518 dst += stride;
2519 }
2520 left = cvtepu8_epi16(LoadLo8(left_column + 16));
2521 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2522 const __m128i y_select = _mm_set1_epi32(y_mask);
2523 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2524 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2525 scaled_top_right1, scaled_top_right2, round);
2526 dst += stride;
2527 }
2528 left = cvtepu8_epi16(LoadLo8(left_column + 24));
2529 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2530 const __m128i y_select = _mm_set1_epi32(y_mask);
2531 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2532 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2533 scaled_top_right1, scaled_top_right2, round);
2534 dst += stride;
2535 }
2536 }
2537
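// Heights of 64 walk the left column in 8-sample chunks, running the 8-row
// shuffle loop once per chunk.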
2538 void aom_smooth_h_predictor_16x64_ssse3(
2539 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2540 const uint8_t *LIBAOM_RESTRICT top_row,
2541 const uint8_t *LIBAOM_RESTRICT left_column) {
2542 const __m128i top_right = _mm_set1_epi16(top_row[15]);
2543 const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2544 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2545 const __m128i weights1 = cvtepu8_epi16(weights);
2546 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2547 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2548 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2549 const __m128i scaled_top_right1 =
2550 _mm_mullo_epi16(inverted_weights1, top_right);
2551 const __m128i scaled_top_right2 =
2552 _mm_mullo_epi16(inverted_weights2, top_right);
2553 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2554 for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2555 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2556 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2557 const __m128i y_select = _mm_set1_epi32(y_mask);
2558 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2559 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2560 scaled_top_right1, scaled_top_right2,
2561 round);
2562 dst += stride;
2563 }
2564 }
2565 }
2566
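// 32-wide blocks need four weight vectors and write each row with two
// 16-pixel stores (dst and dst + 16).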
2567 void aom_smooth_h_predictor_32x8_ssse3(
2568 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2569 const uint8_t *LIBAOM_RESTRICT top_row,
2570 const uint8_t *LIBAOM_RESTRICT left_column) {
2571 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2572 const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2573 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2574 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2575 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2576 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2577 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2578 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2579 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2580 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2581 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2582 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2583 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2584 const __m128i scaled_top_right1 =
2585 _mm_mullo_epi16(inverted_weights1, top_right);
2586 const __m128i scaled_top_right2 =
2587 _mm_mullo_epi16(inverted_weights2, top_right);
2588 const __m128i scaled_top_right3 =
2589 _mm_mullo_epi16(inverted_weights3, top_right);
2590 const __m128i scaled_top_right4 =
2591 _mm_mullo_epi16(inverted_weights4, top_right);
2592 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2593 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2594 __m128i y_select = _mm_set1_epi32(y_mask);
2595 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2596 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2597 scaled_top_right1, scaled_top_right2, round);
2598 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2599 scaled_top_right3, scaled_top_right4, round);
2600 dst += stride;
2601 }
2602 }
2603
2604 void aom_smooth_h_predictor_32x16_ssse3(
2605 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2606 const uint8_t *LIBAOM_RESTRICT top_row,
2607 const uint8_t *LIBAOM_RESTRICT left_column) {
2608 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2609 const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2610 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2611 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2612 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2613 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2614 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2615 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2616 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2617 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2618 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2619 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2620 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2621 const __m128i scaled_top_right1 =
2622 _mm_mullo_epi16(inverted_weights1, top_right);
2623 const __m128i scaled_top_right2 =
2624 _mm_mullo_epi16(inverted_weights2, top_right);
2625 const __m128i scaled_top_right3 =
2626 _mm_mullo_epi16(inverted_weights3, top_right);
2627 const __m128i scaled_top_right4 =
2628 _mm_mullo_epi16(inverted_weights4, top_right);
2629 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2630 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2631 __m128i y_select = _mm_set1_epi32(y_mask);
2632 __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2633 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2634 scaled_top_right1, scaled_top_right2, round);
2635 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2636 scaled_top_right3, scaled_top_right4, round);
2637 dst += stride;
2638 }
2639 const __m128i left2 =
2640 cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
2641 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2642 __m128i y_select = _mm_set1_epi32(y_mask);
2643 __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2644 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2645 scaled_top_right1, scaled_top_right2, round);
2646 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2647 scaled_top_right3, scaled_top_right4, round);
2648 dst += stride;
2649 }
2650 }
2651
2652 void aom_smooth_h_predictor_32x32_ssse3(
2653 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2654 const uint8_t *LIBAOM_RESTRICT top_row,
2655 const uint8_t *LIBAOM_RESTRICT left_column) {
2656 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2657 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2658 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2659 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2660 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2661 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2662 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2663 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2664 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2665 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2666 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2667 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2668 const __m128i scaled_top_right1 =
2669 _mm_mullo_epi16(inverted_weights1, top_right);
2670 const __m128i scaled_top_right2 =
2671 _mm_mullo_epi16(inverted_weights2, top_right);
2672 const __m128i scaled_top_right3 =
2673 _mm_mullo_epi16(inverted_weights3, top_right);
2674 const __m128i scaled_top_right4 =
2675 _mm_mullo_epi16(inverted_weights4, top_right);
2676 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2677 __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2678 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2679 __m128i y_select = _mm_set1_epi32(y_mask);
2680 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2681 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2682 scaled_top_right1, scaled_top_right2, round);
2683 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2684 scaled_top_right3, scaled_top_right4, round);
2685 dst += stride;
2686 }
2687 left = cvtepu8_epi16(LoadLo8(left_column + 8));
2688 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2689 __m128i y_select = _mm_set1_epi32(y_mask);
2690 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2691 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2692 scaled_top_right1, scaled_top_right2, round);
2693 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2694 scaled_top_right3, scaled_top_right4, round);
2695 dst += stride;
2696 }
2697 left = cvtepu8_epi16(LoadLo8(left_column + 16));
2698 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2699 __m128i y_select = _mm_set1_epi32(y_mask);
2700 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2701 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2702 scaled_top_right1, scaled_top_right2, round);
2703 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2704 scaled_top_right3, scaled_top_right4, round);
2705 dst += stride;
2706 }
2707 left = cvtepu8_epi16(LoadLo8(left_column + 24));
2708 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2709 __m128i y_select = _mm_set1_epi32(y_mask);
2710 __m128i left_y = _mm_shuffle_epi8(left, y_select);
2711 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2712 scaled_top_right1, scaled_top_right2, round);
2713 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2714 scaled_top_right3, scaled_top_right4, round);
2715 dst += stride;
2716 }
2717 }
2718
2719 void aom_smooth_h_predictor_32x64_ssse3(
2720 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2721 const uint8_t *LIBAOM_RESTRICT top_row,
2722 const uint8_t *LIBAOM_RESTRICT left_column) {
2723 const __m128i top_right = _mm_set1_epi16(top_row[31]);
2724 const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2725 const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2726 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2727 const __m128i weights1 = cvtepu8_epi16(weights_lo);
2728 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2729 const __m128i weights3 = cvtepu8_epi16(weights_hi);
2730 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2731 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2732 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2733 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2734 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2735 const __m128i scaled_top_right1 =
2736 _mm_mullo_epi16(inverted_weights1, top_right);
2737 const __m128i scaled_top_right2 =
2738 _mm_mullo_epi16(inverted_weights2, top_right);
2739 const __m128i scaled_top_right3 =
2740 _mm_mullo_epi16(inverted_weights3, top_right);
2741 const __m128i scaled_top_right4 =
2742 _mm_mullo_epi16(inverted_weights4, top_right);
2743 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2744 for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2745 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2746 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2747 const __m128i y_select = _mm_set1_epi32(y_mask);
2748 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2749 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2750 scaled_top_right1, scaled_top_right2,
2751 round);
2752 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2753 weights4, scaled_top_right3,
2754 scaled_top_right4, round);
2755 dst += stride;
2756 }
2757 }
2758 }
2759
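// 64-wide blocks extend the same scheme to eight weight vectors and four
// 16-pixel stores per row (dst, dst + 16, dst + 32, dst + 48).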
2760 void aom_smooth_h_predictor_64x16_ssse3(
2761 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2762 const uint8_t *LIBAOM_RESTRICT top_row,
2763 const uint8_t *LIBAOM_RESTRICT left_column) {
2764 const __m128i top_right = _mm_set1_epi16(top_row[63]);
2765 const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2766 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2767 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2768 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2769 const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2770 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2771 const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2772 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2773 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2774 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2775 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2776 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2777 const __m128i scaled_top_right1 =
2778 _mm_mullo_epi16(inverted_weights1, top_right);
2779 const __m128i scaled_top_right2 =
2780 _mm_mullo_epi16(inverted_weights2, top_right);
2781 const __m128i scaled_top_right3 =
2782 _mm_mullo_epi16(inverted_weights3, top_right);
2783 const __m128i scaled_top_right4 =
2784 _mm_mullo_epi16(inverted_weights4, top_right);
2785 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2786 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2787 const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2788 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2789 const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2790 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2791 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2792 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2793 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2794 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2795 const __m128i scaled_top_right5 =
2796 _mm_mullo_epi16(inverted_weights5, top_right);
2797 const __m128i scaled_top_right6 =
2798 _mm_mullo_epi16(inverted_weights6, top_right);
2799 const __m128i scaled_top_right7 =
2800 _mm_mullo_epi16(inverted_weights7, top_right);
2801 const __m128i scaled_top_right8 =
2802 _mm_mullo_epi16(inverted_weights8, top_right);
2803 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2804 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2805 __m128i y_select = _mm_set1_epi32(y_mask);
2806 __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2807 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2808 scaled_top_right1, scaled_top_right2, round);
2809 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2810 scaled_top_right3, scaled_top_right4, round);
2811 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2812 scaled_top_right5, scaled_top_right6, round);
2813 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2814 scaled_top_right7, scaled_top_right8, round);
2815 dst += stride;
2816 }
2817 const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2818 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2819 __m128i y_select = _mm_set1_epi32(y_mask);
2820 __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2821 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2822 scaled_top_right1, scaled_top_right2, round);
2823 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2824 scaled_top_right3, scaled_top_right4, round);
2825 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2826 scaled_top_right5, scaled_top_right6, round);
2827 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2828 scaled_top_right7, scaled_top_right8, round);
2829 dst += stride;
2830 }
2831 }
2832
2833 void aom_smooth_h_predictor_64x32_ssse3(
2834 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2835 const uint8_t *LIBAOM_RESTRICT top_row,
2836 const uint8_t *LIBAOM_RESTRICT left_column) {
2837 const __m128i top_right = _mm_set1_epi16(top_row[63]);
2838 const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2839 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2840 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2841 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2842 const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2843 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2844 const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2845 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2846 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2847 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2848 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2849 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2850 const __m128i scaled_top_right1 =
2851 _mm_mullo_epi16(inverted_weights1, top_right);
2852 const __m128i scaled_top_right2 =
2853 _mm_mullo_epi16(inverted_weights2, top_right);
2854 const __m128i scaled_top_right3 =
2855 _mm_mullo_epi16(inverted_weights3, top_right);
2856 const __m128i scaled_top_right4 =
2857 _mm_mullo_epi16(inverted_weights4, top_right);
2858 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2859 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2860 const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2861 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2862 const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2863 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2864 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2865 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2866 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2867 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2868 const __m128i scaled_top_right5 =
2869 _mm_mullo_epi16(inverted_weights5, top_right);
2870 const __m128i scaled_top_right6 =
2871 _mm_mullo_epi16(inverted_weights6, top_right);
2872 const __m128i scaled_top_right7 =
2873 _mm_mullo_epi16(inverted_weights7, top_right);
2874 const __m128i scaled_top_right8 =
2875 _mm_mullo_epi16(inverted_weights8, top_right);
2876 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2877 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2878 const __m128i y_select = _mm_set1_epi32(y_mask);
2879 const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2880 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2881 scaled_top_right1, scaled_top_right2, round);
2882 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2883 scaled_top_right3, scaled_top_right4, round);
2884 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2885 scaled_top_right5, scaled_top_right6, round);
2886 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2887 scaled_top_right7, scaled_top_right8, round);
2888 dst += stride;
2889 }
2890 const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2891 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2892 const __m128i y_select = _mm_set1_epi32(y_mask);
2893 const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2894 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2895 scaled_top_right1, scaled_top_right2, round);
2896 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2897 scaled_top_right3, scaled_top_right4, round);
2898 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2899 scaled_top_right5, scaled_top_right6, round);
2900 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2901 scaled_top_right7, scaled_top_right8, round);
2902 dst += stride;
2903 }
2904 const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
2905 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2906 const __m128i y_select = _mm_set1_epi32(y_mask);
2907 const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
2908 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2909 scaled_top_right1, scaled_top_right2, round);
2910 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2911 scaled_top_right3, scaled_top_right4, round);
2912 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2913 scaled_top_right5, scaled_top_right6, round);
2914 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2915 scaled_top_right7, scaled_top_right8, round);
2916 dst += stride;
2917 }
2918 const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
2919 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2920 const __m128i y_select = _mm_set1_epi32(y_mask);
2921 const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
2922 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2923 scaled_top_right1, scaled_top_right2, round);
2924 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2925 scaled_top_right3, scaled_top_right4, round);
2926 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2927 scaled_top_right5, scaled_top_right6, round);
2928 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2929 scaled_top_right7, scaled_top_right8, round);
2930 dst += stride;
2931 }
2932 }
2933
2934 void aom_smooth_h_predictor_64x64_ssse3(
2935 uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2936 const uint8_t *LIBAOM_RESTRICT top_row,
2937 const uint8_t *LIBAOM_RESTRICT left_column) {
2938 const __m128i top_right = _mm_set1_epi16(top_row[63]);
2939 const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2940 const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2941 const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2942 const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2943 const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2944 const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2945 const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2946 const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2947 const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2948 const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2949 const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2950 const __m128i scaled_top_right1 =
2951 _mm_mullo_epi16(inverted_weights1, top_right);
2952 const __m128i scaled_top_right2 =
2953 _mm_mullo_epi16(inverted_weights2, top_right);
2954 const __m128i scaled_top_right3 =
2955 _mm_mullo_epi16(inverted_weights3, top_right);
2956 const __m128i scaled_top_right4 =
2957 _mm_mullo_epi16(inverted_weights4, top_right);
2958 const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2959 const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2960 const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2961 const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2962 const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2963 const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2964 const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2965 const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2966 const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2967 const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2968 const __m128i scaled_top_right5 =
2969 _mm_mullo_epi16(inverted_weights5, top_right);
2970 const __m128i scaled_top_right6 =
2971 _mm_mullo_epi16(inverted_weights6, top_right);
2972 const __m128i scaled_top_right7 =
2973 _mm_mullo_epi16(inverted_weights7, top_right);
2974 const __m128i scaled_top_right8 =
2975 _mm_mullo_epi16(inverted_weights8, top_right);
2976 const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2977 for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2978 const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2979 for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2980 const __m128i y_select = _mm_set1_epi32(y_mask);
2981 const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2982 write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2983 scaled_top_right1, scaled_top_right2,
2984 round);
2985 write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2986 weights4, scaled_top_right3,
2987 scaled_top_right4, round);
2988 write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
2989 weights6, scaled_top_right5,
2990 scaled_top_right6, round);
2991 write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
2992 weights8, scaled_top_right7,
2993 scaled_top_right8, round);
2994 dst += stride;
2995 }
2996 }
2997 }
2998