/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

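  // mask1 is set where left is not the smallest of the three distances
  // (pl > pt or pl > ptl); mask2 is set where topleft beats top (pt > ptl).
  // Together they implement the Paeth tie-break order: left, then top,
  // then topleft.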
  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}
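
// A minimal scalar sketch of the selection above, for illustration only (it
// is not part of this file's interface); per 16-bit lane paeth_8x1_pred()
// computes:
//
//   const int base = top + left - topleft;
//   const int pl = abs(base - left);
//   const int pt = abs(base - top);
//   const int ptl = abs(base - topleft);
//   return (pl <= pt && pl <= ptl) ? left : (pt <= ptl) ? top : topleft;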

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
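  // rep is a pshufb control whose 16-bit lanes hold the byte pair { i, 0x80 }:
  // shuffling l with it broadcasts byte i of l (left[i]) into the low byte of
  // every lane and zeroes the high byte (a control byte with its high bit set
  // yields zero), i.e. left[i] zero-extended to 16 bits. Adding one per row
  // steps i through the left column.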
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i l16;

  for (int i = 0; i < 8; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16(0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED

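// A scalar sketch, for illustration only, of the blend the SMOOTH kernels
// below vectorize, with scale = 1 << sm_weight_log2_scale:
//
//   pred(x, y) = (sm_weights_h[y] * above[x] +
//                 (scale - sm_weights_h[y]) * left[bh - 1] +   // below_pred
//                 sm_weights_w[x] * left[y] +
//                 (scale - sm_weights_w[x]) * above[bw - 1] +  // right_pred
//                 scale) >> (sm_weight_log2_scale + 1)
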
// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  if (height == 4)
    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  else if (height == 8)
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));

  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);

  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
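// Note: sm_weight_arrays packs the per-size weight tables back to back, the
// table for block dimension n starting at offset n, which is why the four
// weights for n == 4 are read as ((const uint32_t *)weight_array)[1].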
static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

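  // Per row, d is a pshufb control holding the byte pair { 2i, 2i + 1 } in
  // every 16-bit lane, so shuffling wh[0]/wh[1] with it broadcasts the i-th
  // 16-bit height weight (inc bumps both byte indices by 2 each row). rep
  // likewise broadcasts left[i] (left[i + 8] when second_half is set, hence
  // the 0x8008 start). gat picks byte 0 of each 32-bit lane, so the four
  // 32-bit results land in the low dword as four pixels.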
  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, ww[0]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
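// For height 32, entries [4] through [7] mirror [0] through [3] but with the
// second 16 left pixels, so smooth_pred_8xh() can simply be re-run with
// &pixels[4] for the lower half of the block.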
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  __m128i wh[8], ww[2];
  load_weight_w8(sm_weight_arrays, 32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

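  // Each madd below pairs the interleaved (top pixel, width weight) 16-bit
  // lanes of tw_x_lo/tw_x_hi with wl_y = (weights_y, left[y]) repeated,
  // yielding top[x] * weights_y + weights_w[x] * left[y] per 32-bit lane;
  // the two remaining terms of the smooth blend are added in afterwards.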
  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

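// A scalar sketch, for illustration only, of the vertical-only blend the
// SMOOTH_V kernels below vectorize, with scale = 1 << sm_weight_log2_scale:
//
//   pred(x, y) = (sm_weights_h[y] * above[x] +
//                 (scale - sm_weights_h[y]) * left[bh - 1] +
//                 (scale >> 1)) >> sm_weight_log2_scale
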
// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height == 4) {
    const __m128i weight =
        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(d, weights[2]);
  }
}

static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i d = _mm_set1_epi16(0x100);

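  // d/inc implement the same pshufb weight-broadcast scheme as in
  // smooth_pred_4xh() above: row i reads 16-bit element i of the weight
  // vectors.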
  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 4, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 4, weights);

  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
}

void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 8, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 8, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
}

void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 16, &pixels);

  __m128i weights[4];
  load_weight_v_w4(sm_weight_arrays, 16, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 4, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 4, wh);

  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
}

void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 8, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 8, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
}

void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 16, pixels);

  __m128i wh[4];
  load_weight_v_w8(sm_weight_arrays, 16, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
}

void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 32, pixels);

  __m128i wh[8];
  load_weight_v_w8(sm_weight_arrays, 32, wh);

  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
}

static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED

// pixels[0]: left vector
// pixels[1]: right_pred vector
static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  if (height == 4)
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  else if (height == 8)
    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
}

// weights[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  (void)height;
  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
  const __m128i zero = _mm_setzero_si128();

  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
}

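// With the weights interleaved as (w, scale - w) pairs and the pixels
// interleaved as (left[y], right_pred) pairs, a single _mm_madd_epi16 yields
//   w * left[y] + (scale - w) * right_pred
// per 32-bit lane; the round-and-shift by sm_weight_log2_scale below then
// completes the weighted average.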
static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h, uint8_t *dst,
                                     ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = _mm_set1_epi16(0x8000);
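  // rep is a pshufb control: control byte 0x00 selects the current left
  // pixel and 0x80 zeroes the high byte, so the shuffle below broadcasts
  // left[i] as 16-bit lanes; adding one per row steps to the next pixel.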

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}

void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;

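  // Shift the next eight left pixels into the low bytes for rows 8..15.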
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

// pixels[0]: left vector
// pixels[1]: right_pred vector
// pixels[2]: left vector + 16
// pixels[3]: right_pred vector
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[0] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[0] = _mm_load_si128((const __m128i *)left);
    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[3] = pixels[1];
  }
}
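
// For height 32, pixels[2]/pixels[3] mirror the pixels[0]/pixels[1] layout so
// that &pixels[2] can be passed straight to smooth_h_pred_8xh for rows 16..31.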

// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_w) {
  (void)height;
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
}

static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
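  // For the second half, start the shuffle control at byte 8 so that left
  // pixels 8..15 of the loaded vector are broadcast row by row.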
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}

void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}

void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}

void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

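  // 32 rows are emitted as four 8-row passes: the first two walk bytes 0..15
  // of pixels[0], the last two walk bytes 16..31 via pixels[2].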
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}

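// General-width SMOOTH_H kernel, the horizontal counterpart of
// smooth_v_predictor_wxh above: each output pixel blends the left pixel of
// its row with the top-right reference pixel. A scalar sketch (illustrative
// only; names are not part of the SIMD code):
//   scale = 1 << sm_weight_log2_scale;
//   for (y = 0; y < bh; ++y)
//     for (x = 0; x < bw; ++x)
//       dst[y * stride + x] = (sm_weights_w[x] * left[y] +
//                              (scale - sm_weights_w[x]) * above[bw - 1] +
//                              (scale >> 1)) >> sm_weight_log2_scale;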
static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}
