/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}
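
// For reference, a scalar sketch (not part of the build) of the per-pixel
// Paeth rule that the compare/mask sequence above implements:
//
//   const int base = top + left - topleft;
//   const int pl = abs(base - left);
//   const int pt = abs(base - top);
//   const int ptl = abs(base - topleft);
//   const int pred = (pl <= pt && pl <= ptl) ? left
//                    : (pt <= ptl)           ? top
//                                            : topleft;
//
// The predictors below broadcast left[i] into every 16-bit lane with
// _mm_shuffle_epi8: |rep| starts at 0x8000 in each lane (control byte 0x00
// selects byte 0 of |l|, control byte 0x80 zeroes the high byte) and is
// incremented by one per row so that row i picks up left[i].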

void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i l16;

  for (int i = 0; i < 8; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16((short)0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED

// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
  if (height == 4)
    pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
  else if (height == 8)
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));

  pixels[2] = _mm_set1_epi16((int16_t)above[3]);

  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}
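
// As a concrete illustration (for a 4-wide block), load_pixel_w4() leaves
// pixels[0] holding the 16-bit pairs
//   { A[0], BP, A[1], BP, A[2], BP, A[3], BP }
// where A[] is the above row and BP is the below_pred pixel
// left[height - 1], so a single _mm_madd_epi16 against an interleaved
// (weight, scale - weight) vector produces the vertical half of the smooth
// filter for all four columns at once.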

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(int height, __m128i *weight_h,
                                  __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}
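
// Illustration of the layout produced for height == 4 (w[] being the first
// four smooth_weights entries and scale = 1 << SMOOTH_WEIGHT_LOG2_SCALE):
//   weight_h[0] = { w[0], w[1], w[2], w[3], 0, ... }            (16-bit lanes)
//   weight_h[1] = { scale - w[0], scale - w[1], ... }
//   weight_w[0] = { w[0], scale - w[0], w[1], scale - w[1], ... }
// so the horizontal pass in smooth_pred_4xh() is also a single
// _mm_madd_epi16 against the (left, right_pred) pairs it builds per row.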

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, ww[0]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    sum = _mm_shuffle_epi8(sum, gat);
    *(int *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}
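
// Combining the two madds above, each output pixel of the 4-wide smooth
// predictor is, as a sketch of the arithmetic (scale is
// 1 << SMOOTH_WEIGHT_LOG2_SCALE):
//
//   pred(x, y) = (weight_h[y] * above[x] + (scale - weight_h[y]) * below_pred
//                 + weight_w[x] * left[y] + (scale - weight_w[x]) * right_pred
//                 + scale) >> (1 + SMOOTH_WEIGHT_LOG2_SCALE)
//
// with below_pred = left[height - 1] and right_pred = above[3]. The |gat|
// mask gathers the low byte of each 32-bit sum into the first four bytes so
// a single 32-bit store writes the row.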

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  pixels[3] = _mm_set1_epi16((int16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}
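
// Sketch of the 8-wide layout: pixels[0] and pixels[1] hold the 16-bit pairs
//   { A[0], BP, A[1], BP, A[2], BP, A[3], BP } and
//   { A[4], BP, A[5], BP, A[6], BP, A[7], BP },
// with A[] the above row and BP the below_pred pixel left[height - 1];
// pixels[2] (and pixels[6] for the lower half of a 32-tall block) carries
// the left-column bytes that are broadcast one row at a time in
// smooth_pred_8xh().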

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(int height, __m128i *weight_h,
                                  __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 0 : 4;
  __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
                            : _mm_set1_epi16((short)0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  __m128i wh[8], ww[2];
  load_weight_w8(32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
// (available in 2019+) or greater is specified; __restrict can be used in that
// case. This should be moved to rtcd and used consistently between the
// function declarations and definitions to avoid warnings in Visual Studio
// when defining LIBAOM_RESTRICT to restrict or __restrict.
#if defined(_MSC_VER)
#define LIBAOM_RESTRICT
#else
#define LIBAOM_RESTRICT restrict
#endif

static AOM_FORCE_INLINE __m128i Load4(const void *src) {
  // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
  // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
  // movss instruction.
  //
  // Until compiler support of _mm_loadu_si32 is widespread, use of
  // _mm_loadu_si32 is banned.
  int val;
  memcpy(&val, src, sizeof(val));
  return _mm_cvtsi32_si128(val);
}

static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
  return _mm_loadl_epi64((const __m128i *)(a));
}

static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
  return _mm_loadu_si128((const __m128i *)(a));
}

static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
  const int val = _mm_cvtsi128_si32(x);
  memcpy(dst, &val, sizeof(val));
}

static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
  _mm_storel_epi64((__m128i *)(a), v);
}

static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
  _mm_storeu_si128((__m128i *)(a), v);
}

static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
  return _mm_unpacklo_epi8((x), _mm_setzero_si128());
}

static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
  const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
  return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
}

static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
  return _mm_unpacklo_epi16((x), _mm_setzero_si128());
}

void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
                          const uint8_t *LIBAOM_RESTRICT top_row,
                          const uint8_t *LIBAOM_RESTRICT left_column, int width,
                          int height) {
  const uint8_t *const sm_weights_h = smooth_weights + height - 4;
  const uint8_t *const sm_weights_w = smooth_weights + width - 4;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
  const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
  const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
  for (int y = 0; y < height; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i scaled_bottom_left =
        _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i weight_left_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
    scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
    for (int x = 0; x < width; x += 8) {
      const __m128i top_x = LoadLo8(top_row + x);
      const __m128i weights_x = LoadLo8(sm_weights_w + x);
      const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
      const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);

      // Here opposite weights and pixels are multiplied, where the order of
      // interleaving is indicated in the names.
      __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
      __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);

      // |scaled_bottom_left| is always scaled by the same weight each row, so
      // we only derive |scaled_top_right| values here.
      const __m128i inverted_weights_x =
          _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
      const __m128i scaled_top_right =
          _mm_mullo_epi16(inverted_weights_x, top_right);
      const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
      const __m128i scaled_top_right_hi =
          _mm_unpackhi_epi16(scaled_top_right, zero);
      pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
      pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
      pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
      pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);

      // The round value for RightShiftWithRounding was added with
      // |scaled_bottom_left|.
      pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
      pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
      const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
    }
    dst += stride;
  }
}
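
// Note on the inner loop above: |top_weights_x| interleaves top pixels with
// the horizontal weights and |weight_left_y| interleaves the vertical weight
// with left_column[y], so each _mm_madd_epi16 lane evaluates
//   top_row[x] * weights_y + sm_weights_w[x] * left_column[y],
// i.e. both "opposite" products of the smooth filter in one instruction; the
// remaining two products arrive as |scaled_bottom_left| and
// |scaled_top_right|. The width passed in is assumed to be a multiple of 8.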

void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

// -----------------------------------------------------------------------------
// Smooth horizontal/vertical helper functions.

// For Horizontal, pixels1 and pixels2 are the same repeated value. For
// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
// scaled_corner2 are the same.
static AOM_FORCE_INLINE void write_smooth_directional_sum16(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
    const __m128i weights1, const __m128i weights2,
    const __m128i scaled_corner1, const __m128i scaled_corner2,
    const __m128i round) {
  const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
  const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
  const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
  const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
  const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
  StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
}
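
// In SMOOTH_V terms the helper above evaluates, per pixel (a sketch; the
// scale is 256 when SMOOTH_WEIGHT_LOG2_SCALE == 8):
//   pred(x, y) = (weights[y] * top[x] +
//                 (256 - weights[y]) * bottom_left + 128) >> 8
// and the horizontal variant swaps the roles of the row and the column.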

static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
    const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
  const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
  return _mm_add_epi16(scaled_corner, weighted_px);
}

static AOM_FORCE_INLINE void write_smooth_directional_sum8(
    uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
    const __m128i *scaled_corner, const __m128i *round) {
  const __m128i pred_sum =
      smooth_directional_sum8(*pixels, *weights, *scaled_corner);
  // Equivalent to RightShiftWithRounding(pred[x][y], 8).
  const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
  StoreLo8(dst, _mm_packus_epi16(pred, pred));
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
    const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
    const int height, __m128i *pixels) {
  __m128i top = Load4(above);
  const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
  top = cvtepu8_epi16(top);
  pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
}

// |weight_array| alternates weight vectors from the table with their inverted
// (256-w) counterparts. This is precomputed by the compiler when the weights
// table is visible to this module. Removing this visibility can cut speed by
// up to half in both 4xH and 8xH transforms.
static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
    const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
    __m128i *weights) {
  const __m128i inverter = _mm_set1_epi16(256);

  if (height == 4) {
    const __m128i weight = Load4(weight_array);
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else if (height == 8) {
    const __m128i weight = LoadLo8(weight_array + 4);
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
  } else {
    const __m128i weight = LoadUnaligned16(weight_array + 12);
    const __m128i zero = _mm_setzero_si128();
    weights[0] = cvtepu8_epi16(weight);
    weights[1] = _mm_sub_epi16(inverter, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(inverter, weights[2]);
  }
}

static AOM_FORCE_INLINE void write_smooth_vertical4xh(
    const __m128i *pixel, const __m128i *weight, const int height,
    uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32(128);
  const __m128i mask_increment = _mm_set1_epi16(0x0202);
  const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
  __m128i y_select = _mm_set1_epi16(0x0100);

  for (int y = 0; y < height; ++y) {
    const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
    const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
    const __m128i alternate_weights =
        _mm_unpacklo_epi16(weight_y, inverted_weight_y);
    // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
    // The madd instruction yields four results of the form:
    // (top_row[x] * weight[y] + corner * inverted_weight[y])
    __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, 8);
    sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
    Store4(dst, sum);
    dst += stride;
    y_select = _mm_add_epi16(y_select, mask_increment);
  }
}
1174 
1175 void aom_smooth_v_predictor_4x4_ssse3(
1176     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1177     const uint8_t *LIBAOM_RESTRICT top_row,
1178     const uint8_t *LIBAOM_RESTRICT left_column) {
1179   __m128i pixels;
1180   load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
1181 
1182   __m128i weights[2];
1183   load_smooth_vertical_weights4(smooth_weights, 4, weights);
1184 
1185   write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
1186 }
1187 
1188 void aom_smooth_v_predictor_4x8_ssse3(
1189     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1190     const uint8_t *LIBAOM_RESTRICT top_row,
1191     const uint8_t *LIBAOM_RESTRICT left_column) {
1192   __m128i pixels;
1193   load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
1194 
1195   __m128i weights[2];
1196   load_smooth_vertical_weights4(smooth_weights, 8, weights);
1197 
1198   write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1199 }
1200 
1201 void aom_smooth_v_predictor_4x16_ssse3(
1202     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1203     const uint8_t *LIBAOM_RESTRICT top_row,
1204     const uint8_t *LIBAOM_RESTRICT left_column) {
1205   __m128i pixels;
1206   load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
1207 
1208   __m128i weights[4];
1209   load_smooth_vertical_weights4(smooth_weights, 16, weights);
1210 
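  // Rows 0-7 use weights[0]/weights[1]; rows 8-15 use weights[2]/weights[3],
  // written as two 8-row passes.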
1211   write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
1212   dst += stride << 3;
1213   write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
1214 }
1215 
1216 void aom_smooth_v_predictor_8x4_ssse3(
1217     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1218     const uint8_t *LIBAOM_RESTRICT top_row,
1219     const uint8_t *LIBAOM_RESTRICT left_column) {
1220   const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1221   const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1222   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1223   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1224   const __m128i scaled_bottom_left =
1225       _mm_mullo_epi16(inverted_weights, bottom_left);
1226   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1227   __m128i y_select = _mm_set1_epi32(0x01000100);
1228   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1229   __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1230   __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1231   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1232                                 &round);
1233   dst += stride;
1234   y_select = _mm_set1_epi32(0x03020302);
1235   weights_y = _mm_shuffle_epi8(weights, y_select);
1236   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1237   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1238                                 &round);
1239   dst += stride;
1240   y_select = _mm_set1_epi32(0x05040504);
1241   weights_y = _mm_shuffle_epi8(weights, y_select);
1242   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1243   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1244                                 &round);
1245   dst += stride;
1246   y_select = _mm_set1_epi32(0x07060706);
1247   weights_y = _mm_shuffle_epi8(weights, y_select);
1248   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1249   write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1250                                 &round);
1251 }
1252 
1253 void aom_smooth_v_predictor_8x8_ssse3(
1254     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1255     const uint8_t *LIBAOM_RESTRICT top_row,
1256     const uint8_t *LIBAOM_RESTRICT left_column) {
1257   const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1258   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1259   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1260   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1261   const __m128i scaled_bottom_left =
1262       _mm_mullo_epi16(inverted_weights, bottom_left);
1263   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1264   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
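  // Each iteration uses pshufb to broadcast the y-th 16-bit weight (and the
  // matching scaled bottom-left term) to every lane; the byte-index mask
  // starts at row 0 and advances by two bytes per row, covering eight rows.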
1265   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1266     const __m128i y_select = _mm_set1_epi32(y_mask);
1267     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1268     const __m128i scaled_bottom_left_y =
1269         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1270     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1271                                   &round);
1272     dst += stride;
1273   }
1274 }
1275 
1276 void aom_smooth_v_predictor_8x16_ssse3(
1277     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1278     const uint8_t *LIBAOM_RESTRICT top_row,
1279     const uint8_t *LIBAOM_RESTRICT left_column) {
1280   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1281   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1282 
1283   const __m128i weights1 = cvtepu8_epi16(weights);
1284   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
1285   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1286   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1287   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1288   const __m128i scaled_bottom_left1 =
1289       _mm_mullo_epi16(inverted_weights1, bottom_left);
1290   const __m128i scaled_bottom_left2 =
1291       _mm_mullo_epi16(inverted_weights2, bottom_left);
1292   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1293   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1294   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1295     const __m128i y_select = _mm_set1_epi32(y_mask);
1296     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1297     const __m128i scaled_bottom_left_y =
1298         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1299     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1300                                   &round);
1301     dst += stride;
1302   }
1303   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1304     const __m128i y_select = _mm_set1_epi32(y_mask);
1305     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1306     const __m128i scaled_bottom_left_y =
1307         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1308     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1309                                   &round);
1310     dst += stride;
1311   }
1312 }
1313 
1314 void aom_smooth_v_predictor_8x32_ssse3(
1315     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1316     const uint8_t *LIBAOM_RESTRICT top_row,
1317     const uint8_t *LIBAOM_RESTRICT left_column) {
1318   const __m128i zero = _mm_setzero_si128();
1319   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1320   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1321   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1322   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1323   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1324   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1325   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1326   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1327   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1328   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1329   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1330   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1331   const __m128i scaled_bottom_left1 =
1332       _mm_mullo_epi16(inverted_weights1, bottom_left);
1333   const __m128i scaled_bottom_left2 =
1334       _mm_mullo_epi16(inverted_weights2, bottom_left);
1335   const __m128i scaled_bottom_left3 =
1336       _mm_mullo_epi16(inverted_weights3, bottom_left);
1337   const __m128i scaled_bottom_left4 =
1338       _mm_mullo_epi16(inverted_weights4, bottom_left);
1339   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1340   const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
1341   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1342     const __m128i y_select = _mm_set1_epi32(y_mask);
1343     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1344     const __m128i scaled_bottom_left_y =
1345         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1346     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1347                                   &round);
1348     dst += stride;
1349   }
1350   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1351     const __m128i y_select = _mm_set1_epi32(y_mask);
1352     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1353     const __m128i scaled_bottom_left_y =
1354         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1355     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1356                                   &round);
1357     dst += stride;
1358   }
1359   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1360     const __m128i y_select = _mm_set1_epi32(y_mask);
1361     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1362     const __m128i scaled_bottom_left_y =
1363         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1364     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1365                                   &round);
1366     dst += stride;
1367   }
1368   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1369     const __m128i y_select = _mm_set1_epi32(y_mask);
1370     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1371     const __m128i scaled_bottom_left_y =
1372         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1373     write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
1374                                   &round);
1375     dst += stride;
1376   }
1377 }
1378 
1379 void aom_smooth_v_predictor_16x4_ssse3(
1380     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1381     const uint8_t *LIBAOM_RESTRICT top_row,
1382     const uint8_t *LIBAOM_RESTRICT left_column) {
1383   const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
1384   const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
1385   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1386   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1387   const __m128i scaled_bottom_left =
1388       _mm_mullo_epi16(inverted_weights, bottom_left);
1389   const __m128i round = _mm_set1_epi16(128);
1390   const __m128i top = LoadUnaligned16(top_row);
1391   const __m128i top_lo = cvtepu8_epi16(top);
1392   const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1393 
1394   __m128i y_select = _mm_set1_epi32(0x01000100);
1395   __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1396   __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1397   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1398                                  scaled_bottom_left_y, scaled_bottom_left_y,
1399                                  round);
1400   dst += stride;
1401   y_select = _mm_set1_epi32(0x03020302);
1402   weights_y = _mm_shuffle_epi8(weights, y_select);
1403   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1404   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1405                                  scaled_bottom_left_y, scaled_bottom_left_y,
1406                                  round);
1407   dst += stride;
1408   y_select = _mm_set1_epi32(0x05040504);
1409   weights_y = _mm_shuffle_epi8(weights, y_select);
1410   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1411   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1412                                  scaled_bottom_left_y, scaled_bottom_left_y,
1413                                  round);
1414   dst += stride;
1415   y_select = _mm_set1_epi32(0x07060706);
1416   weights_y = _mm_shuffle_epi8(weights, y_select);
1417   scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
1418   write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1419                                  scaled_bottom_left_y, scaled_bottom_left_y,
1420                                  round);
1421 }
1422 
1423 void aom_smooth_v_predictor_16x8_ssse3(
1424     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1425     const uint8_t *LIBAOM_RESTRICT top_row,
1426     const uint8_t *LIBAOM_RESTRICT left_column) {
1427   const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1428   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1429   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1430   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1431   const __m128i scaled_bottom_left =
1432       _mm_mullo_epi16(inverted_weights, bottom_left);
1433   const __m128i round = _mm_set1_epi16(128);
1434   const __m128i top = LoadUnaligned16(top_row);
1435   const __m128i top_lo = cvtepu8_epi16(top);
1436   const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
1437   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1438     const __m128i y_select = _mm_set1_epi32(y_mask);
1439     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1440     const __m128i scaled_bottom_left_y =
1441         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1442     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1443                                    scaled_bottom_left_y, scaled_bottom_left_y,
1444                                    round);
1445     dst += stride;
1446   }
1447 }
1448 
1449 void aom_smooth_v_predictor_16x16_ssse3(
1450     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1451     const uint8_t *LIBAOM_RESTRICT top_row,
1452     const uint8_t *LIBAOM_RESTRICT left_column) {
1453   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1454   const __m128i zero = _mm_setzero_si128();
1455   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1456   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1457   const __m128i weights_lo = cvtepu8_epi16(weights);
1458   const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1459   const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1460   const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1461   const __m128i scaled_bottom_left_lo =
1462       _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1463   const __m128i scaled_bottom_left_hi =
1464       _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1465   const __m128i round = _mm_set1_epi16(128);
1466 
1467   const __m128i top = LoadUnaligned16(top_row);
1468   const __m128i top_lo = cvtepu8_epi16(top);
1469   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1470   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1471     const __m128i y_select = _mm_set1_epi32(y_mask);
1472     const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1473     const __m128i scaled_bottom_left_y =
1474         _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1475     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1476                                    scaled_bottom_left_y, scaled_bottom_left_y,
1477                                    round);
1478     dst += stride;
1479   }
1480   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1481     const __m128i y_select = _mm_set1_epi32(y_mask);
1482     const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1483     const __m128i scaled_bottom_left_y =
1484         _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1485     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1486                                    scaled_bottom_left_y, scaled_bottom_left_y,
1487                                    round);
1488     dst += stride;
1489   }
1490 }
1491 
1492 void aom_smooth_v_predictor_16x32_ssse3(
1493     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1494     const uint8_t *LIBAOM_RESTRICT top_row,
1495     const uint8_t *LIBAOM_RESTRICT left_column) {
1496   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1497   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1498   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1499   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1500   const __m128i zero = _mm_setzero_si128();
1501   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1502   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1503   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1504   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1505   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1506   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1507   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1508   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1509   const __m128i scaled_bottom_left1 =
1510       _mm_mullo_epi16(inverted_weights1, bottom_left);
1511   const __m128i scaled_bottom_left2 =
1512       _mm_mullo_epi16(inverted_weights2, bottom_left);
1513   const __m128i scaled_bottom_left3 =
1514       _mm_mullo_epi16(inverted_weights3, bottom_left);
1515   const __m128i scaled_bottom_left4 =
1516       _mm_mullo_epi16(inverted_weights4, bottom_left);
1517   const __m128i round = _mm_set1_epi16(128);
1518 
1519   const __m128i top = LoadUnaligned16(top_row);
1520   const __m128i top_lo = cvtepu8_epi16(top);
1521   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1522   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1523     const __m128i y_select = _mm_set1_epi32(y_mask);
1524     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1525     const __m128i scaled_bottom_left_y =
1526         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1527     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1528                                    scaled_bottom_left_y, scaled_bottom_left_y,
1529                                    round);
1530     dst += stride;
1531   }
1532   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1533     const __m128i y_select = _mm_set1_epi32(y_mask);
1534     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1535     const __m128i scaled_bottom_left_y =
1536         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1537     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1538                                    scaled_bottom_left_y, scaled_bottom_left_y,
1539                                    round);
1540     dst += stride;
1541   }
1542   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1543     const __m128i y_select = _mm_set1_epi32(y_mask);
1544     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1545     const __m128i scaled_bottom_left_y =
1546         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1547     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1548                                    scaled_bottom_left_y, scaled_bottom_left_y,
1549                                    round);
1550     dst += stride;
1551   }
1552   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1553     const __m128i y_select = _mm_set1_epi32(y_mask);
1554     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1555     const __m128i scaled_bottom_left_y =
1556         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1557     write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1558                                    scaled_bottom_left_y, scaled_bottom_left_y,
1559                                    round);
1560     dst += stride;
1561   }
1562 }
1563 
1564 void aom_smooth_v_predictor_16x64_ssse3(
1565     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1566     const uint8_t *LIBAOM_RESTRICT top_row,
1567     const uint8_t *LIBAOM_RESTRICT left_column) {
1568   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1569   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1570   const __m128i round = _mm_set1_epi16(128);
1571   const __m128i zero = _mm_setzero_si128();
1572   const __m128i top = LoadUnaligned16(top_row);
1573   const __m128i top_lo = cvtepu8_epi16(top);
1574   const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
1575   const uint8_t *weights_base_ptr = smooth_weights + 60;
1576   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1577     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1578     const __m128i weights_lo = cvtepu8_epi16(weights);
1579     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1580     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1581     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1582     const __m128i scaled_bottom_left_lo =
1583         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1584     const __m128i scaled_bottom_left_hi =
1585         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1586 
1587     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1588       const __m128i y_select = _mm_set1_epi32(y_mask);
1589       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1590       const __m128i scaled_bottom_left_y =
1591           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1592       write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1593                                      scaled_bottom_left_y, scaled_bottom_left_y,
1594                                      round);
1595       dst += stride;
1596     }
1597     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1598       const __m128i y_select = _mm_set1_epi32(y_mask);
1599       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1600       const __m128i scaled_bottom_left_y =
1601           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1602       write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
1603                                      scaled_bottom_left_y, scaled_bottom_left_y,
1604                                      round);
1605       dst += stride;
1606     }
1607   }
1608 }
1609 
1610 void aom_smooth_v_predictor_32x8_ssse3(
1611     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1612     const uint8_t *LIBAOM_RESTRICT top_row,
1613     const uint8_t *LIBAOM_RESTRICT left_column) {
1614   const __m128i zero = _mm_setzero_si128();
1615   const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
1616   const __m128i top_lo = LoadUnaligned16(top_row);
1617   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1618   const __m128i top1 = cvtepu8_epi16(top_lo);
1619   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1620   const __m128i top3 = cvtepu8_epi16(top_hi);
1621   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1622   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1623   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
1624   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
1625   const __m128i scaled_bottom_left =
1626       _mm_mullo_epi16(inverted_weights, bottom_left);
1627   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1628   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1629     __m128i y_select = _mm_set1_epi32(y_mask);
1630     const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
1631     const __m128i scaled_bottom_left_y =
1632         _mm_shuffle_epi8(scaled_bottom_left, y_select);
1633     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1634                                    scaled_bottom_left_y, scaled_bottom_left_y,
1635                                    round);
1636     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1637                                    scaled_bottom_left_y, scaled_bottom_left_y,
1638                                    round);
1639     dst += stride;
1640   }
1641 }
1642 
1643 void aom_smooth_v_predictor_32x16_ssse3(
1644     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1645     const uint8_t *LIBAOM_RESTRICT top_row,
1646     const uint8_t *LIBAOM_RESTRICT left_column) {
1647   const __m128i zero = _mm_setzero_si128();
1648   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1649   const __m128i top_lo = LoadUnaligned16(top_row);
1650   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1651   const __m128i top1 = cvtepu8_epi16(top_lo);
1652   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1653   const __m128i top3 = cvtepu8_epi16(top_hi);
1654   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1655   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1656   const __m128i weights1 = cvtepu8_epi16(weights);
1657   const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1658   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1659   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1660   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1661   const __m128i scaled_bottom_left1 =
1662       _mm_mullo_epi16(inverted_weights1, bottom_left);
1663   const __m128i scaled_bottom_left2 =
1664       _mm_mullo_epi16(inverted_weights2, bottom_left);
1665   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1666   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1667     __m128i y_select = _mm_set1_epi32(y_mask);
1668     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1669     const __m128i scaled_bottom_left_y =
1670         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1671     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1672                                    scaled_bottom_left_y, scaled_bottom_left_y,
1673                                    round);
1674     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1675                                    scaled_bottom_left_y, scaled_bottom_left_y,
1676                                    round);
1677     dst += stride;
1678   }
1679   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1680     __m128i y_select = _mm_set1_epi32(y_mask);
1681     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1682     const __m128i scaled_bottom_left_y =
1683         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1684     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1685                                    scaled_bottom_left_y, scaled_bottom_left_y,
1686                                    round);
1687     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1688                                    scaled_bottom_left_y, scaled_bottom_left_y,
1689                                    round);
1690     dst += stride;
1691   }
1692 }
1693 
1694 void aom_smooth_v_predictor_32x32_ssse3(
1695     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1696     const uint8_t *LIBAOM_RESTRICT top_row,
1697     const uint8_t *LIBAOM_RESTRICT left_column) {
1698   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1699   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1700   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1701   const __m128i zero = _mm_setzero_si128();
1702   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1703   const __m128i top_lo = LoadUnaligned16(top_row);
1704   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1705   const __m128i top1 = cvtepu8_epi16(top_lo);
1706   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1707   const __m128i top3 = cvtepu8_epi16(top_hi);
1708   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1709   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1710   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1711   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1712   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1713   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1714   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1715   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1716   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1717   const __m128i scaled_bottom_left1 =
1718       _mm_mullo_epi16(inverted_weights1, bottom_left);
1719   const __m128i scaled_bottom_left2 =
1720       _mm_mullo_epi16(inverted_weights2, bottom_left);
1721   const __m128i scaled_bottom_left3 =
1722       _mm_mullo_epi16(inverted_weights3, bottom_left);
1723   const __m128i scaled_bottom_left4 =
1724       _mm_mullo_epi16(inverted_weights4, bottom_left);
1725   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1726   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1727     const __m128i y_select = _mm_set1_epi32(y_mask);
1728     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1729     const __m128i scaled_bottom_left_y =
1730         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1731     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1732                                    scaled_bottom_left_y, scaled_bottom_left_y,
1733                                    round);
1734     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1735                                    scaled_bottom_left_y, scaled_bottom_left_y,
1736                                    round);
1737     dst += stride;
1738   }
1739   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1740     const __m128i y_select = _mm_set1_epi32(y_mask);
1741     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1742     const __m128i scaled_bottom_left_y =
1743         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1744     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1745                                    scaled_bottom_left_y, scaled_bottom_left_y,
1746                                    round);
1747     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1748                                    scaled_bottom_left_y, scaled_bottom_left_y,
1749                                    round);
1750     dst += stride;
1751   }
1752   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1753     const __m128i y_select = _mm_set1_epi32(y_mask);
1754     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1755     const __m128i scaled_bottom_left_y =
1756         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1757     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1758                                    scaled_bottom_left_y, scaled_bottom_left_y,
1759                                    round);
1760     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1761                                    scaled_bottom_left_y, scaled_bottom_left_y,
1762                                    round);
1763     dst += stride;
1764   }
1765   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1766     const __m128i y_select = _mm_set1_epi32(y_mask);
1767     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
1768     const __m128i scaled_bottom_left_y =
1769         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
1770     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1771                                    scaled_bottom_left_y, scaled_bottom_left_y,
1772                                    round);
1773     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1774                                    scaled_bottom_left_y, scaled_bottom_left_y,
1775                                    round);
1776     dst += stride;
1777   }
1778 }
1779 
1780 void aom_smooth_v_predictor_32x64_ssse3(
1781     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1782     const uint8_t *LIBAOM_RESTRICT top_row,
1783     const uint8_t *LIBAOM_RESTRICT left_column) {
1784   const __m128i zero = _mm_setzero_si128();
1785   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
1786   const __m128i top_lo = LoadUnaligned16(top_row);
1787   const __m128i top_hi = LoadUnaligned16(top_row + 16);
1788   const __m128i top1 = cvtepu8_epi16(top_lo);
1789   const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
1790   const __m128i top3 = cvtepu8_epi16(top_hi);
1791   const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
1792   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1793   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1794   const uint8_t *weights_base_ptr = smooth_weights + 60;
1795   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
1796     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
1797     const __m128i weights_lo = cvtepu8_epi16(weights);
1798     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
1799     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
1800     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
1801     const __m128i scaled_bottom_left_lo =
1802         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
1803     const __m128i scaled_bottom_left_hi =
1804         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
1805 
1806     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1807       const __m128i y_select = _mm_set1_epi32(y_mask);
1808       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
1809       const __m128i scaled_bottom_left_y =
1810           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
1811       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1812                                      scaled_bottom_left_y, scaled_bottom_left_y,
1813                                      round);
1814       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1815                                      scaled_bottom_left_y, scaled_bottom_left_y,
1816                                      round);
1817       dst += stride;
1818     }
1819     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1820       const __m128i y_select = _mm_set1_epi32(y_mask);
1821       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
1822       const __m128i scaled_bottom_left_y =
1823           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
1824       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1825                                      scaled_bottom_left_y, scaled_bottom_left_y,
1826                                      round);
1827       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1828                                      scaled_bottom_left_y, scaled_bottom_left_y,
1829                                      round);
1830       dst += stride;
1831     }
1832   }
1833 }
1834 
1835 void aom_smooth_v_predictor_64x16_ssse3(
1836     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1837     const uint8_t *LIBAOM_RESTRICT top_row,
1838     const uint8_t *LIBAOM_RESTRICT left_column) {
1839   const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
1840   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1841   const __m128i zero = _mm_setzero_si128();
1842   const __m128i top_lolo = LoadUnaligned16(top_row);
1843   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1844   const __m128i top1 = cvtepu8_epi16(top_lolo);
1845   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1846   const __m128i top3 = cvtepu8_epi16(top_lohi);
1847   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1848 
1849   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
1850   const __m128i weights1 = cvtepu8_epi16(weights);
1851   const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
1852   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1853   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1854   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1855   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1856   const __m128i top5 = cvtepu8_epi16(top_hilo);
1857   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1858   const __m128i top7 = cvtepu8_epi16(top_hihi);
1859   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1860   const __m128i scaled_bottom_left1 =
1861       _mm_mullo_epi16(inverted_weights1, bottom_left);
1862   const __m128i scaled_bottom_left2 =
1863       _mm_mullo_epi16(inverted_weights2, bottom_left);
1864   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1865   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1866     const __m128i y_select = _mm_set1_epi32(y_mask);
1867     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1868     const __m128i scaled_bottom_left_y =
1869         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1870     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1871                                    scaled_bottom_left_y, scaled_bottom_left_y,
1872                                    round);
1873     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1874                                    scaled_bottom_left_y, scaled_bottom_left_y,
1875                                    round);
1876     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1877                                    scaled_bottom_left_y, scaled_bottom_left_y,
1878                                    round);
1879     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1880                                    scaled_bottom_left_y, scaled_bottom_left_y,
1881                                    round);
1882     dst += stride;
1883   }
1884   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1885     const __m128i y_select = _mm_set1_epi32(y_mask);
1886     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1887     const __m128i scaled_bottom_left_y =
1888         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1889     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1890                                    scaled_bottom_left_y, scaled_bottom_left_y,
1891                                    round);
1892     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1893                                    scaled_bottom_left_y, scaled_bottom_left_y,
1894                                    round);
1895     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1896                                    scaled_bottom_left_y, scaled_bottom_left_y,
1897                                    round);
1898     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1899                                    scaled_bottom_left_y, scaled_bottom_left_y,
1900                                    round);
1901     dst += stride;
1902   }
1903 }
1904 
1905 void aom_smooth_v_predictor_64x32_ssse3(
1906     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
1907     const uint8_t *LIBAOM_RESTRICT top_row,
1908     const uint8_t *LIBAOM_RESTRICT left_column) {
1909   const __m128i zero = _mm_setzero_si128();
1910   const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
1911   const __m128i top_lolo = LoadUnaligned16(top_row);
1912   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
1913   const __m128i top1 = cvtepu8_epi16(top_lolo);
1914   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
1915   const __m128i top3 = cvtepu8_epi16(top_lohi);
1916   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
1917   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
1918   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
1919   const __m128i top5 = cvtepu8_epi16(top_hilo);
1920   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
1921   const __m128i top7 = cvtepu8_epi16(top_hihi);
1922   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
1923   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
1924   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
1925   const __m128i weights1 = cvtepu8_epi16(weights_lo);
1926   const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
1927   const __m128i weights3 = cvtepu8_epi16(weights_hi);
1928   const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
1929   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
1930   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
1931   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
1932   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
1933   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
1934   const __m128i scaled_bottom_left1 =
1935       _mm_mullo_epi16(inverted_weights1, bottom_left);
1936   const __m128i scaled_bottom_left2 =
1937       _mm_mullo_epi16(inverted_weights2, bottom_left);
1938   const __m128i scaled_bottom_left3 =
1939       _mm_mullo_epi16(inverted_weights3, bottom_left);
1940   const __m128i scaled_bottom_left4 =
1941       _mm_mullo_epi16(inverted_weights4, bottom_left);
1942   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
1943 
1944   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1945     const __m128i y_select = _mm_set1_epi32(y_mask);
1946     const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
1947     const __m128i scaled_bottom_left_y =
1948         _mm_shuffle_epi8(scaled_bottom_left1, y_select);
1949     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1950                                    scaled_bottom_left_y, scaled_bottom_left_y,
1951                                    round);
1952     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1953                                    scaled_bottom_left_y, scaled_bottom_left_y,
1954                                    round);
1955     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1956                                    scaled_bottom_left_y, scaled_bottom_left_y,
1957                                    round);
1958     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1959                                    scaled_bottom_left_y, scaled_bottom_left_y,
1960                                    round);
1961     dst += stride;
1962   }
1963   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1964     const __m128i y_select = _mm_set1_epi32(y_mask);
1965     const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
1966     const __m128i scaled_bottom_left_y =
1967         _mm_shuffle_epi8(scaled_bottom_left2, y_select);
1968     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1969                                    scaled_bottom_left_y, scaled_bottom_left_y,
1970                                    round);
1971     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1972                                    scaled_bottom_left_y, scaled_bottom_left_y,
1973                                    round);
1974     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1975                                    scaled_bottom_left_y, scaled_bottom_left_y,
1976                                    round);
1977     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1978                                    scaled_bottom_left_y, scaled_bottom_left_y,
1979                                    round);
1980     dst += stride;
1981   }
1982   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
1983     const __m128i y_select = _mm_set1_epi32(y_mask);
1984     const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
1985     const __m128i scaled_bottom_left_y =
1986         _mm_shuffle_epi8(scaled_bottom_left3, y_select);
1987     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
1988                                    scaled_bottom_left_y, scaled_bottom_left_y,
1989                                    round);
1990     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
1991                                    scaled_bottom_left_y, scaled_bottom_left_y,
1992                                    round);
1993     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
1994                                    scaled_bottom_left_y, scaled_bottom_left_y,
1995                                    round);
1996     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
1997                                    scaled_bottom_left_y, scaled_bottom_left_y,
1998                                    round);
1999     dst += stride;
2000   }
2001   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2002     const __m128i y_select = _mm_set1_epi32(y_mask);
2003     const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
2004     const __m128i scaled_bottom_left_y =
2005         _mm_shuffle_epi8(scaled_bottom_left4, y_select);
2006     write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2007                                    scaled_bottom_left_y, scaled_bottom_left_y,
2008                                    round);
2009     write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2010                                    scaled_bottom_left_y, scaled_bottom_left_y,
2011                                    round);
2012     write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2013                                    scaled_bottom_left_y, scaled_bottom_left_y,
2014                                    round);
2015     write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2016                                    scaled_bottom_left_y, scaled_bottom_left_y,
2017                                    round);
2018     dst += stride;
2019   }
2020 }
2021 
2022 void aom_smooth_v_predictor_64x64_ssse3(
2023     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2024     const uint8_t *LIBAOM_RESTRICT top_row,
2025     const uint8_t *LIBAOM_RESTRICT left_column) {
2026   const __m128i zero = _mm_setzero_si128();
2027   const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
2028   const __m128i top_lolo = LoadUnaligned16(top_row);
2029   const __m128i top_lohi = LoadUnaligned16(top_row + 16);
2030   const __m128i top1 = cvtepu8_epi16(top_lolo);
2031   const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
2032   const __m128i top3 = cvtepu8_epi16(top_lohi);
2033   const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
2034   const __m128i top_hilo = LoadUnaligned16(top_row + 32);
2035   const __m128i top_hihi = LoadUnaligned16(top_row + 48);
2036   const __m128i top5 = cvtepu8_epi16(top_hilo);
2037   const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
2038   const __m128i top7 = cvtepu8_epi16(top_hihi);
2039   const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
2040   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2041   const __m128i round = _mm_set1_epi16(128);
2042   const uint8_t *weights_base_ptr = smooth_weights + 60;
2043   for (int left_offset = 0; left_offset < 64; left_offset += 16) {
2044     const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
2045     const __m128i weights_lo = cvtepu8_epi16(weights);
2046     const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
2047     const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
2048     const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
2049     const __m128i scaled_bottom_left_lo =
2050         _mm_mullo_epi16(inverted_weights_lo, bottom_left);
2051     const __m128i scaled_bottom_left_hi =
2052         _mm_mullo_epi16(inverted_weights_hi, bottom_left);
2053     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2054       const __m128i y_select = _mm_set1_epi32(y_mask);
2055       const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
2056       const __m128i scaled_bottom_left_y =
2057           _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
2058       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2059                                      scaled_bottom_left_y, scaled_bottom_left_y,
2060                                      round);
2061       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2062                                      scaled_bottom_left_y, scaled_bottom_left_y,
2063                                      round);
2064       write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2065                                      scaled_bottom_left_y, scaled_bottom_left_y,
2066                                      round);
2067       write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2068                                      scaled_bottom_left_y, scaled_bottom_left_y,
2069                                      round);
2070       dst += stride;
2071     }
2072     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2073       const __m128i y_select = _mm_set1_epi32(y_mask);
2074       const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
2075       const __m128i scaled_bottom_left_y =
2076           _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
2077       write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
2078                                      scaled_bottom_left_y, scaled_bottom_left_y,
2079                                      round);
2080       write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
2081                                      scaled_bottom_left_y, scaled_bottom_left_y,
2082                                      round);
2083       write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
2084                                      scaled_bottom_left_y, scaled_bottom_left_y,
2085                                      round);
2086       write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
2087                                      scaled_bottom_left_y, scaled_bottom_left_y,
2088                                      round);
2089       dst += stride;
2090     }
2091   }
2092 }
2093 
2094 // -----------------------------------------------------------------------------
2095 // SMOOTH_H_PRED
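// For reference, a scalar sketch of the SMOOTH_H computation that the kernels
// in this section vectorize (assuming SMOOTH_WEIGHT_LOG2_SCALE == 8; the loop
// below is illustrative, not part of this file):
//
//   const uint8_t top_right = top_row[width - 1];
//   const uint8_t *weights = smooth_weights + width - 4;  // per-column weights
//   for (int y = 0; y < height; ++y) {
//     for (int x = 0; x < width; ++x) {
//       const int sum =
//           weights[x] * left_column[y] + (256 - weights[x]) * top_right;
//       dst[x] = (uint8_t)((sum + 128) >> 8);  // RightShiftWithRounding(sum, 8)
//     }
//     dst += stride;
//   }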
2096 static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
2097     uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
2098     const __m128i *scaled_top_right, const __m128i *round) {
2099   const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
2100   const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
2101   // Equivalent to RightShiftWithRounding(pred[x][y], 8).
2102   const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
2103   const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
2104   Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
2105 }
2106 
2107 void aom_smooth_h_predictor_4x4_ssse3(
2108     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2109     const uint8_t *LIBAOM_RESTRICT top_row,
2110     const uint8_t *LIBAOM_RESTRICT left_column) {
2111   const __m128i top_right = _mm_set1_epi32(top_row[3]);
2112   const __m128i left = cvtepu8_epi32(Load4(left_column));
2113   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2114   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2115   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2116   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2117   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2118   __m128i left_y = _mm_shuffle_epi32(left, 0);
2119   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2120                                &round);
2121   dst += stride;
2122   left_y = _mm_shuffle_epi32(left, 0x55);
2123   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2124                                &round);
2125   dst += stride;
2126   left_y = _mm_shuffle_epi32(left, 0xaa);
2127   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2128                                &round);
2129   dst += stride;
2130   left_y = _mm_shuffle_epi32(left, 0xff);
2131   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2132                                &round);
2133 }
2134 
2135 void aom_smooth_h_predictor_4x8_ssse3(
2136     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2137     const uint8_t *LIBAOM_RESTRICT top_row,
2138     const uint8_t *LIBAOM_RESTRICT left_column) {
2139   const __m128i top_right = _mm_set1_epi32(top_row[3]);
2140   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2141   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2142   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2143   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2144   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2145   __m128i left = cvtepu8_epi32(Load4(left_column));
2146   __m128i left_y = _mm_shuffle_epi32(left, 0);
2147   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2148                                &round);
2149   dst += stride;
2150   left_y = _mm_shuffle_epi32(left, 0x55);
2151   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2152                                &round);
2153   dst += stride;
2154   left_y = _mm_shuffle_epi32(left, 0xaa);
2155   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2156                                &round);
2157   dst += stride;
2158   left_y = _mm_shuffle_epi32(left, 0xff);
2159   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2160                                &round);
2161   dst += stride;
2162 
2163   left = cvtepu8_epi32(Load4(left_column + 4));
2164   left_y = _mm_shuffle_epi32(left, 0);
2165   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2166                                &round);
2167   dst += stride;
2168   left_y = _mm_shuffle_epi32(left, 0x55);
2169   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2170                                &round);
2171   dst += stride;
2172   left_y = _mm_shuffle_epi32(left, 0xaa);
2173   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2174                                &round);
2175   dst += stride;
2176   left_y = _mm_shuffle_epi32(left, 0xff);
2177   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2178                                &round);
2179 }
2180 
2181 void aom_smooth_h_predictor_4x16_ssse3(
2182     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2183     const uint8_t *LIBAOM_RESTRICT top_row,
2184     const uint8_t *LIBAOM_RESTRICT left_column) {
2185   const __m128i top_right = _mm_set1_epi32(top_row[3]);
2186   const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
2187   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2188   const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
2189   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2190   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2191   __m128i left = cvtepu8_epi32(Load4(left_column));
2192   __m128i left_y = _mm_shuffle_epi32(left, 0);
2193   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2194                                &round);
2195   dst += stride;
2196   left_y = _mm_shuffle_epi32(left, 0x55);
2197   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2198                                &round);
2199   dst += stride;
2200   left_y = _mm_shuffle_epi32(left, 0xaa);
2201   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2202                                &round);
2203   dst += stride;
2204   left_y = _mm_shuffle_epi32(left, 0xff);
2205   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2206                                &round);
2207   dst += stride;
2208 
2209   left = cvtepu8_epi32(Load4(left_column + 4));
2210   left_y = _mm_shuffle_epi32(left, 0);
2211   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2212                                &round);
2213   dst += stride;
2214   left_y = _mm_shuffle_epi32(left, 0x55);
2215   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2216                                &round);
2217   dst += stride;
2218   left_y = _mm_shuffle_epi32(left, 0xaa);
2219   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2220                                &round);
2221   dst += stride;
2222   left_y = _mm_shuffle_epi32(left, 0xff);
2223   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2224                                &round);
2225   dst += stride;
2226 
2227   left = cvtepu8_epi32(Load4(left_column + 8));
2228   left_y = _mm_shuffle_epi32(left, 0);
2229   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2230                                &round);
2231   dst += stride;
2232   left_y = _mm_shuffle_epi32(left, 0x55);
2233   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2234                                &round);
2235   dst += stride;
2236   left_y = _mm_shuffle_epi32(left, 0xaa);
2237   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2238                                &round);
2239   dst += stride;
2240   left_y = _mm_shuffle_epi32(left, 0xff);
2241   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2242                                &round);
2243   dst += stride;
2244 
2245   left = cvtepu8_epi32(Load4(left_column + 12));
2246   left_y = _mm_shuffle_epi32(left, 0);
2247   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2248                                &round);
2249   dst += stride;
2250   left_y = _mm_shuffle_epi32(left, 0x55);
2251   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2252                                &round);
2253   dst += stride;
2254   left_y = _mm_shuffle_epi32(left, 0xaa);
2255   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2256                                &round);
2257   dst += stride;
2258   left_y = _mm_shuffle_epi32(left, 0xff);
2259   write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
2260                                &round);
2261 }
2262 
2263 // For SMOOTH_H, |pixels| is the repeated left value for the row. For SMOOTH_V,
2264 // |pixels| is a segment of the top row or the whole top row, and |weights| is
2265 // repeated.
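// Both write_smooth_directional_sum8 and write_smooth_directional_sum16
// compute, per 16-bit lane, roughly
//   (pixels * weights + scaled_corner + round) >> SMOOTH_WEIGHT_LOG2_SCALE
// and pack the lanes to bytes before storing 8 or 16 pixels per call.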
2266 void aom_smooth_h_predictor_8x4_ssse3(
2267     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2268     const uint8_t *LIBAOM_RESTRICT top_row,
2269     const uint8_t *LIBAOM_RESTRICT left_column) {
2270   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2271   const __m128i left = cvtepu8_epi16(Load4(left_column));
2272   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2273   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2274   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2275   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2276   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2277   __m128i y_select = _mm_set1_epi32(0x01000100);
2278   __m128i left_y = _mm_shuffle_epi8(left, y_select);
2279   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2280                                 &round);
2281   dst += stride;
2282   y_select = _mm_set1_epi32(0x03020302);
2283   left_y = _mm_shuffle_epi8(left, y_select);
2284   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2285                                 &round);
2286   dst += stride;
2287   y_select = _mm_set1_epi32(0x05040504);
2288   left_y = _mm_shuffle_epi8(left, y_select);
2289   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2290                                 &round);
2291   dst += stride;
2292   y_select = _mm_set1_epi32(0x07060706);
2293   left_y = _mm_shuffle_epi8(left, y_select);
2294   write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2295                                 &round);
2296 }
2297 
2298 void aom_smooth_h_predictor_8x8_ssse3(
2299     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2300     const uint8_t *LIBAOM_RESTRICT top_row,
2301     const uint8_t *LIBAOM_RESTRICT left_column) {
2302   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2303   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2304   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2305   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2306   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2307   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2308   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
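  // Each y_mask, splatted to every 32-bit lane, is the pshufb pattern
  // {2y, 2y+1, 2y, 2y+1}: it broadcasts the y-th 16-bit left pixel to all
  // lanes, so one loop iteration produces one output row.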
2309   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2310     const __m128i y_select = _mm_set1_epi32(y_mask);
2311     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2312     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2313                                   &round);
2314     dst += stride;
2315   }
2316 }
2317 
2318 void aom_smooth_h_predictor_8x16_ssse3(
2319     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2320     const uint8_t *LIBAOM_RESTRICT top_row,
2321     const uint8_t *LIBAOM_RESTRICT left_column) {
2322   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2323   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2324   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2325   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2326   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2327   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2328   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2329   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2330     const __m128i y_select = _mm_set1_epi32(y_mask);
2331     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2332     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2333                                   &round);
2334     dst += stride;
2335   }
2336   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2337   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2338     const __m128i y_select = _mm_set1_epi32(y_mask);
2339     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2340     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2341                                   &round);
2342     dst += stride;
2343   }
2344 }
2345 
2346 void aom_smooth_h_predictor_8x32_ssse3(
2347     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2348     const uint8_t *LIBAOM_RESTRICT top_row,
2349     const uint8_t *LIBAOM_RESTRICT left_column) {
2350   const __m128i top_right = _mm_set1_epi16(top_row[7]);
2351   const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
2352   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2353   const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
2354   const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
2355   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2356   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2357   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2358     const __m128i y_select = _mm_set1_epi32(y_mask);
2359     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2360     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2361                                   &round);
2362     dst += stride;
2363   }
2364   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2365   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2366     const __m128i y_select = _mm_set1_epi32(y_mask);
2367     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2368     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2369                                   &round);
2370     dst += stride;
2371   }
2372   left = cvtepu8_epi16(LoadLo8(left_column + 16));
2373   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2374     const __m128i y_select = _mm_set1_epi32(y_mask);
2375     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2376     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2377                                   &round);
2378     dst += stride;
2379   }
2380   left = cvtepu8_epi16(LoadLo8(left_column + 24));
2381   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2382     const __m128i y_select = _mm_set1_epi32(y_mask);
2383     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2384     write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
2385                                   &round);
2386     dst += stride;
2387   }
2388 }
2389 
2390 void aom_smooth_h_predictor_16x4_ssse3(
2391     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2392     const uint8_t *LIBAOM_RESTRICT top_row,
2393     const uint8_t *LIBAOM_RESTRICT left_column) {
2394   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2395   const __m128i left = cvtepu8_epi16(Load4(left_column));
2396   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2397   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2398   const __m128i weights1 = cvtepu8_epi16(weights);
2399   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2400   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2401   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2402   const __m128i scaled_top_right1 =
2403       _mm_mullo_epi16(inverted_weights1, top_right);
2404   const __m128i scaled_top_right2 =
2405       _mm_mullo_epi16(inverted_weights2, top_right);
2406   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
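  // Each 16-wide row is built from two 8-lane halves: weights1 and
  // scaled_top_right1 cover columns 0-7, weights2 and scaled_top_right2 cover
  // columns 8-15; write_smooth_directional_sum16 packs both into one store.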
2407   __m128i y_mask = _mm_set1_epi32(0x01000100);
2408   __m128i left_y = _mm_shuffle_epi8(left, y_mask);
2409   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2410                                  scaled_top_right1, scaled_top_right2, round);
2411   dst += stride;
2412   y_mask = _mm_set1_epi32(0x03020302);
2413   left_y = _mm_shuffle_epi8(left, y_mask);
2414   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2415                                  scaled_top_right1, scaled_top_right2, round);
2416   dst += stride;
2417   y_mask = _mm_set1_epi32(0x05040504);
2418   left_y = _mm_shuffle_epi8(left, y_mask);
2419   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2420                                  scaled_top_right1, scaled_top_right2, round);
2421   dst += stride;
2422   y_mask = _mm_set1_epi32(0x07060706);
2423   left_y = _mm_shuffle_epi8(left, y_mask);
2424   write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2425                                  scaled_top_right1, scaled_top_right2, round);
2426 }
2427 
2428 void aom_smooth_h_predictor_16x8_ssse3(
2429     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2430     const uint8_t *LIBAOM_RESTRICT top_row,
2431     const uint8_t *LIBAOM_RESTRICT left_column) {
2432   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2433   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2434   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2435   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2436   const __m128i weights1 = cvtepu8_epi16(weights);
2437   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2438   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2439   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2440   const __m128i scaled_top_right1 =
2441       _mm_mullo_epi16(inverted_weights1, top_right);
2442   const __m128i scaled_top_right2 =
2443       _mm_mullo_epi16(inverted_weights2, top_right);
2444   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2445   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2446     const __m128i y_select = _mm_set1_epi32(y_mask);
2447     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2448     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2449                                    scaled_top_right1, scaled_top_right2, round);
2450     dst += stride;
2451   }
2452 }
2453 
2454 void aom_smooth_h_predictor_16x16_ssse3(
2455     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2456     const uint8_t *LIBAOM_RESTRICT top_row,
2457     const uint8_t *LIBAOM_RESTRICT left_column) {
2458   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2459   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2460   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2461   const __m128i weights1 = cvtepu8_epi16(weights);
2462   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2463   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2464   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2465   const __m128i scaled_top_right1 =
2466       _mm_mullo_epi16(inverted_weights1, top_right);
2467   const __m128i scaled_top_right2 =
2468       _mm_mullo_epi16(inverted_weights2, top_right);
2469   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2470   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2471   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2472     const __m128i y_select = _mm_set1_epi32(y_mask);
2473     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2474     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2475                                    scaled_top_right1, scaled_top_right2, round);
2476     dst += stride;
2477   }
2478   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2479   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2480     const __m128i y_select = _mm_set1_epi32(y_mask);
2481     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2482     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2483                                    scaled_top_right1, scaled_top_right2, round);
2484     dst += stride;
2485   }
2486 }
2487 
2488 void aom_smooth_h_predictor_16x32_ssse3(
2489     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2490     const uint8_t *LIBAOM_RESTRICT top_row,
2491     const uint8_t *LIBAOM_RESTRICT left_column) {
2492   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2493   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2494   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2495   const __m128i weights1 = cvtepu8_epi16(weights);
2496   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2497   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2498   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2499   const __m128i scaled_top_right1 =
2500       _mm_mullo_epi16(inverted_weights1, top_right);
2501   const __m128i scaled_top_right2 =
2502       _mm_mullo_epi16(inverted_weights2, top_right);
2503   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2504   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2505   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2506     const __m128i y_select = _mm_set1_epi32(y_mask);
2507     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2508     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2509                                    scaled_top_right1, scaled_top_right2, round);
2510     dst += stride;
2511   }
2512   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2513   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2514     const __m128i y_select = _mm_set1_epi32(y_mask);
2515     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2516     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2517                                    scaled_top_right1, scaled_top_right2, round);
2518     dst += stride;
2519   }
2520   left = cvtepu8_epi16(LoadLo8(left_column + 16));
2521   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2522     const __m128i y_select = _mm_set1_epi32(y_mask);
2523     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2524     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2525                                    scaled_top_right1, scaled_top_right2, round);
2526     dst += stride;
2527   }
2528   left = cvtepu8_epi16(LoadLo8(left_column + 24));
2529   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2530     const __m128i y_select = _mm_set1_epi32(y_mask);
2531     const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2532     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2533                                    scaled_top_right1, scaled_top_right2, round);
2534     dst += stride;
2535   }
2536 }
2537 
2538 void aom_smooth_h_predictor_16x64_ssse3(
2539     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2540     const uint8_t *LIBAOM_RESTRICT top_row,
2541     const uint8_t *LIBAOM_RESTRICT left_column) {
2542   const __m128i top_right = _mm_set1_epi16(top_row[15]);
2543   const __m128i weights = LoadUnaligned16(smooth_weights + 12);
2544   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2545   const __m128i weights1 = cvtepu8_epi16(weights);
2546   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
2547   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2548   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2549   const __m128i scaled_top_right1 =
2550       _mm_mullo_epi16(inverted_weights1, top_right);
2551   const __m128i scaled_top_right2 =
2552       _mm_mullo_epi16(inverted_weights2, top_right);
2553   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
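  // The 64 rows are emitted in eight groups of eight: each outer iteration
  // widens the next 8 left-column pixels, and the inner y_mask loop writes
  // one 16-wide row per pixel.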
2554   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2555     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2556     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2557       const __m128i y_select = _mm_set1_epi32(y_mask);
2558       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2559       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2560                                      scaled_top_right1, scaled_top_right2,
2561                                      round);
2562       dst += stride;
2563     }
2564   }
2565 }
2566 
2567 void aom_smooth_h_predictor_32x8_ssse3(
2568     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2569     const uint8_t *LIBAOM_RESTRICT top_row,
2570     const uint8_t *LIBAOM_RESTRICT left_column) {
2571   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2572   const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2573   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2574   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2575   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2576   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2577   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2578   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2579   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2580   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2581   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2582   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2583   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2584   const __m128i scaled_top_right1 =
2585       _mm_mullo_epi16(inverted_weights1, top_right);
2586   const __m128i scaled_top_right2 =
2587       _mm_mullo_epi16(inverted_weights2, top_right);
2588   const __m128i scaled_top_right3 =
2589       _mm_mullo_epi16(inverted_weights3, top_right);
2590   const __m128i scaled_top_right4 =
2591       _mm_mullo_epi16(inverted_weights4, top_right);
2592   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2593   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2594     __m128i y_select = _mm_set1_epi32(y_mask);
2595     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2596     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2597                                    scaled_top_right1, scaled_top_right2, round);
2598     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2599                                    scaled_top_right3, scaled_top_right4, round);
2600     dst += stride;
2601   }
2602 }
2603 
2604 void aom_smooth_h_predictor_32x16_ssse3(
2605     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2606     const uint8_t *LIBAOM_RESTRICT top_row,
2607     const uint8_t *LIBAOM_RESTRICT left_column) {
2608   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2609   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2610   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2611   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2612   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2613   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2614   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2615   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2616   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2617   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2618   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2619   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2620   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2621   const __m128i scaled_top_right1 =
2622       _mm_mullo_epi16(inverted_weights1, top_right);
2623   const __m128i scaled_top_right2 =
2624       _mm_mullo_epi16(inverted_weights2, top_right);
2625   const __m128i scaled_top_right3 =
2626       _mm_mullo_epi16(inverted_weights3, top_right);
2627   const __m128i scaled_top_right4 =
2628       _mm_mullo_epi16(inverted_weights4, top_right);
2629   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2630   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2631     __m128i y_select = _mm_set1_epi32(y_mask);
2632     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2633     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2634                                    scaled_top_right1, scaled_top_right2, round);
2635     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2636                                    scaled_top_right3, scaled_top_right4, round);
2637     dst += stride;
2638   }
2639   const __m128i left2 =
2640       cvtepu8_epi16(LoadLo8((const uint8_t *)left_column + 8));
2641   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2642     __m128i y_select = _mm_set1_epi32(y_mask);
2643     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2644     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2645                                    scaled_top_right1, scaled_top_right2, round);
2646     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2647                                    scaled_top_right3, scaled_top_right4, round);
2648     dst += stride;
2649   }
2650 }
2651 
2652 void aom_smooth_h_predictor_32x32_ssse3(
2653     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2654     const uint8_t *LIBAOM_RESTRICT top_row,
2655     const uint8_t *LIBAOM_RESTRICT left_column) {
2656   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2657   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2658   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2659   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2660   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2661   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2662   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2663   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2664   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2665   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2666   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2667   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2668   const __m128i scaled_top_right1 =
2669       _mm_mullo_epi16(inverted_weights1, top_right);
2670   const __m128i scaled_top_right2 =
2671       _mm_mullo_epi16(inverted_weights2, top_right);
2672   const __m128i scaled_top_right3 =
2673       _mm_mullo_epi16(inverted_weights3, top_right);
2674   const __m128i scaled_top_right4 =
2675       _mm_mullo_epi16(inverted_weights4, top_right);
2676   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2677   __m128i left = cvtepu8_epi16(LoadLo8(left_column));
2678   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2679     __m128i y_select = _mm_set1_epi32(y_mask);
2680     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2681     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2682                                    scaled_top_right1, scaled_top_right2, round);
2683     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2684                                    scaled_top_right3, scaled_top_right4, round);
2685     dst += stride;
2686   }
2687   left = cvtepu8_epi16(LoadLo8(left_column + 8));
2688   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2689     __m128i y_select = _mm_set1_epi32(y_mask);
2690     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2691     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2692                                    scaled_top_right1, scaled_top_right2, round);
2693     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2694                                    scaled_top_right3, scaled_top_right4, round);
2695     dst += stride;
2696   }
2697   left = cvtepu8_epi16(LoadLo8(left_column + 16));
2698   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2699     __m128i y_select = _mm_set1_epi32(y_mask);
2700     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2701     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2702                                    scaled_top_right1, scaled_top_right2, round);
2703     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2704                                    scaled_top_right3, scaled_top_right4, round);
2705     dst += stride;
2706   }
2707   left = cvtepu8_epi16(LoadLo8(left_column + 24));
2708   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2709     __m128i y_select = _mm_set1_epi32(y_mask);
2710     __m128i left_y = _mm_shuffle_epi8(left, y_select);
2711     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2712                                    scaled_top_right1, scaled_top_right2, round);
2713     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2714                                    scaled_top_right3, scaled_top_right4, round);
2715     dst += stride;
2716   }
2717 }
2718 
2719 void aom_smooth_h_predictor_32x64_ssse3(
2720     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2721     const uint8_t *LIBAOM_RESTRICT top_row,
2722     const uint8_t *LIBAOM_RESTRICT left_column) {
2723   const __m128i top_right = _mm_set1_epi16(top_row[31]);
2724   const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
2725   const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
2726   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2727   const __m128i weights1 = cvtepu8_epi16(weights_lo);
2728   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
2729   const __m128i weights3 = cvtepu8_epi16(weights_hi);
2730   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
2731   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2732   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2733   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2734   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2735   const __m128i scaled_top_right1 =
2736       _mm_mullo_epi16(inverted_weights1, top_right);
2737   const __m128i scaled_top_right2 =
2738       _mm_mullo_epi16(inverted_weights2, top_right);
2739   const __m128i scaled_top_right3 =
2740       _mm_mullo_epi16(inverted_weights3, top_right);
2741   const __m128i scaled_top_right4 =
2742       _mm_mullo_epi16(inverted_weights4, top_right);
2743   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2744   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2745     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2746     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2747       const __m128i y_select = _mm_set1_epi32(y_mask);
2748       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2749       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2750                                      scaled_top_right1, scaled_top_right2,
2751                                      round);
2752       write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2753                                      weights4, scaled_top_right3,
2754                                      scaled_top_right4, round);
2755       dst += stride;
2756     }
2757   }
2758 }
2759 
2760 void aom_smooth_h_predictor_64x16_ssse3(
2761     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2762     const uint8_t *LIBAOM_RESTRICT top_row,
2763     const uint8_t *LIBAOM_RESTRICT left_column) {
2764   const __m128i top_right = _mm_set1_epi16(top_row[63]);
2765   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2766   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2767   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2768   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2769   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2770   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2771   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2772   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2773   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2774   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2775   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2776   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2777   const __m128i scaled_top_right1 =
2778       _mm_mullo_epi16(inverted_weights1, top_right);
2779   const __m128i scaled_top_right2 =
2780       _mm_mullo_epi16(inverted_weights2, top_right);
2781   const __m128i scaled_top_right3 =
2782       _mm_mullo_epi16(inverted_weights3, top_right);
2783   const __m128i scaled_top_right4 =
2784       _mm_mullo_epi16(inverted_weights4, top_right);
2785   const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2786   const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2787   const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2788   const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2789   const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2790   const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2791   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2792   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2793   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2794   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2795   const __m128i scaled_top_right5 =
2796       _mm_mullo_epi16(inverted_weights5, top_right);
2797   const __m128i scaled_top_right6 =
2798       _mm_mullo_epi16(inverted_weights6, top_right);
2799   const __m128i scaled_top_right7 =
2800       _mm_mullo_epi16(inverted_weights7, top_right);
2801   const __m128i scaled_top_right8 =
2802       _mm_mullo_epi16(inverted_weights8, top_right);
2803   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
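  // A 64-wide row needs four 16-byte stores; weights1..weights8 and
  // scaled_top_right1..scaled_top_right8 are the widened low/high halves of
  // the four weight loads, covering columns 0-15, 16-31, 32-47 and 48-63.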
2804   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2805     __m128i y_select = _mm_set1_epi32(y_mask);
2806     __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2807     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2808                                    scaled_top_right1, scaled_top_right2, round);
2809     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2810                                    scaled_top_right3, scaled_top_right4, round);
2811     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2812                                    scaled_top_right5, scaled_top_right6, round);
2813     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2814                                    scaled_top_right7, scaled_top_right8, round);
2815     dst += stride;
2816   }
2817   const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2818   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2819     __m128i y_select = _mm_set1_epi32(y_mask);
2820     __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2821     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2822                                    scaled_top_right1, scaled_top_right2, round);
2823     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2824                                    scaled_top_right3, scaled_top_right4, round);
2825     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2826                                    scaled_top_right5, scaled_top_right6, round);
2827     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2828                                    scaled_top_right7, scaled_top_right8, round);
2829     dst += stride;
2830   }
2831 }
2832 
2833 void aom_smooth_h_predictor_64x32_ssse3(
2834     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2835     const uint8_t *LIBAOM_RESTRICT top_row,
2836     const uint8_t *LIBAOM_RESTRICT left_column) {
2837   const __m128i top_right = _mm_set1_epi16(top_row[63]);
2838   const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
2839   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2840   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2841   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2842   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2843   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2844   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2845   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2846   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2847   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2848   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2849   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2850   const __m128i scaled_top_right1 =
2851       _mm_mullo_epi16(inverted_weights1, top_right);
2852   const __m128i scaled_top_right2 =
2853       _mm_mullo_epi16(inverted_weights2, top_right);
2854   const __m128i scaled_top_right3 =
2855       _mm_mullo_epi16(inverted_weights3, top_right);
2856   const __m128i scaled_top_right4 =
2857       _mm_mullo_epi16(inverted_weights4, top_right);
2858   const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2859   const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2860   const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2861   const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2862   const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2863   const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2864   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2865   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2866   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2867   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2868   const __m128i scaled_top_right5 =
2869       _mm_mullo_epi16(inverted_weights5, top_right);
2870   const __m128i scaled_top_right6 =
2871       _mm_mullo_epi16(inverted_weights6, top_right);
2872   const __m128i scaled_top_right7 =
2873       _mm_mullo_epi16(inverted_weights7, top_right);
2874   const __m128i scaled_top_right8 =
2875       _mm_mullo_epi16(inverted_weights8, top_right);
2876   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2877   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2878     const __m128i y_select = _mm_set1_epi32(y_mask);
2879     const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
2880     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2881                                    scaled_top_right1, scaled_top_right2, round);
2882     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2883                                    scaled_top_right3, scaled_top_right4, round);
2884     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2885                                    scaled_top_right5, scaled_top_right6, round);
2886     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2887                                    scaled_top_right7, scaled_top_right8, round);
2888     dst += stride;
2889   }
2890   const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
2891   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2892     const __m128i y_select = _mm_set1_epi32(y_mask);
2893     const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
2894     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2895                                    scaled_top_right1, scaled_top_right2, round);
2896     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2897                                    scaled_top_right3, scaled_top_right4, round);
2898     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2899                                    scaled_top_right5, scaled_top_right6, round);
2900     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2901                                    scaled_top_right7, scaled_top_right8, round);
2902     dst += stride;
2903   }
2904   const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
2905   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2906     const __m128i y_select = _mm_set1_epi32(y_mask);
2907     const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
2908     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2909                                    scaled_top_right1, scaled_top_right2, round);
2910     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2911                                    scaled_top_right3, scaled_top_right4, round);
2912     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2913                                    scaled_top_right5, scaled_top_right6, round);
2914     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2915                                    scaled_top_right7, scaled_top_right8, round);
2916     dst += stride;
2917   }
2918   const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
2919   for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2920     const __m128i y_select = _mm_set1_epi32(y_mask);
2921     const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
2922     write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2923                                    scaled_top_right1, scaled_top_right2, round);
2924     write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
2925                                    scaled_top_right3, scaled_top_right4, round);
2926     write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
2927                                    scaled_top_right5, scaled_top_right6, round);
2928     write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
2929                                    scaled_top_right7, scaled_top_right8, round);
2930     dst += stride;
2931   }
2932 }
2933 
2934 void aom_smooth_h_predictor_64x64_ssse3(
2935     uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
2936     const uint8_t *LIBAOM_RESTRICT top_row,
2937     const uint8_t *LIBAOM_RESTRICT left_column) {
2938   const __m128i top_right = _mm_set1_epi16(top_row[63]);
2939   const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
2940   const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
2941   const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
2942   const __m128i weights1 = cvtepu8_epi16(weights_lolo);
2943   const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
2944   const __m128i weights3 = cvtepu8_epi16(weights_lohi);
2945   const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
2946   const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
2947   const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
2948   const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
2949   const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
2950   const __m128i scaled_top_right1 =
2951       _mm_mullo_epi16(inverted_weights1, top_right);
2952   const __m128i scaled_top_right2 =
2953       _mm_mullo_epi16(inverted_weights2, top_right);
2954   const __m128i scaled_top_right3 =
2955       _mm_mullo_epi16(inverted_weights3, top_right);
2956   const __m128i scaled_top_right4 =
2957       _mm_mullo_epi16(inverted_weights4, top_right);
2958   const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
2959   const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
2960   const __m128i weights5 = cvtepu8_epi16(weights_hilo);
2961   const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
2962   const __m128i weights7 = cvtepu8_epi16(weights_hihi);
2963   const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
2964   const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
2965   const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
2966   const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
2967   const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
2968   const __m128i scaled_top_right5 =
2969       _mm_mullo_epi16(inverted_weights5, top_right);
2970   const __m128i scaled_top_right6 =
2971       _mm_mullo_epi16(inverted_weights6, top_right);
2972   const __m128i scaled_top_right7 =
2973       _mm_mullo_epi16(inverted_weights7, top_right);
2974   const __m128i scaled_top_right8 =
2975       _mm_mullo_epi16(inverted_weights8, top_right);
2976   const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
2977   for (int left_offset = 0; left_offset < 64; left_offset += 8) {
2978     const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
2979     for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
2980       const __m128i y_select = _mm_set1_epi32(y_mask);
2981       const __m128i left_y = _mm_shuffle_epi8(left, y_select);
2982       write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
2983                                      scaled_top_right1, scaled_top_right2,
2984                                      round);
2985       write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
2986                                      weights4, scaled_top_right3,
2987                                      scaled_top_right4, round);
2988       write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
2989                                      weights6, scaled_top_right5,
2990                                      scaled_top_right6, round);
2991       write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
2992                                      weights8, scaled_top_right7,
2993                                      scaled_top_right8, round);
2994       dst += stride;
2995     }
2996   }
2997 }
2998