• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
#include <emmintrin.h>
#include <string.h>
#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15 
// Fills a 4-pixel-wide block: copies the four bytes of `dc` to the start of
// each of `height` consecutive rows, where rows are `stride` bytes apart.
//
//   dc     - four packed pixel bytes to replicate on every row.
//   height - number of rows to write; nothing is written when <= 0.
//   dst    - first byte of the first row; no alignment is required.
//   stride - distance in bytes between the starts of successive rows.
static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  // One row per iteration so that exactly `height` rows are written, even
  // when `height` is odd.
  for (int i = 0; i < height; ++i) {
    // memcpy rather than *(uint32_t *)dst: storing through a uint32_t lvalue
    // into a uint8_t buffer violates the effective-type rule and may be a
    // misaligned access. Compilers lower this to a single 32-bit store.
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25 
// Fills an 8-pixel-wide block: writes the low 8 bytes of `*row` to the start
// of each of `height` rows, `stride` bytes apart. The store is unaligned-safe
// (_mm_storel_epi64).
static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  const __m128i value = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    _mm_storel_epi64((__m128i *)dst, value);
    dst += stride;
  }
}
34 
// Fills a 16-pixel-wide block: writes all 16 bytes of `*row` to the start of
// each of `height` rows, `stride` bytes apart.
// Uses the aligned store _mm_store_si128, so every row start (dst and
// dst + k * stride) must be 16-byte aligned.
static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  const __m128i value = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    _mm_store_si128((__m128i *)dst, value);
    dst += stride;
  }
}
43 
// Fills a 32-pixel-wide block: writes `*row` twice (bytes 0-15 and 16-31) on
// each of `height` rows, `stride` bytes apart.
// Uses aligned stores, so every row start must be 16-byte aligned.
static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  const __m128i value = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    for (int x = 0; x < 32; x += 16) {
      _mm_store_si128((__m128i *)(dst + x), value);
    }
    dst += stride;
  }
}
53 
// Fills a 64-pixel-wide block: writes `*row` four times (covering bytes 0-63)
// on each of `height` rows, `stride` bytes apart.
// Uses aligned stores, so every row start must be 16-byte aligned.
static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  const __m128i value = *row;
  for (int rows_left = height; rows_left > 0; --rows_left) {
    for (int x = 0; x < 64; x += 16) {
      _mm_store_si128((__m128i *)(dst + x), value);
    }
    dst += stride;
  }
}
64 
// Sums the 4 bytes at `ref`.
//
// Returns a vector whose low 64-bit lane holds the sum (at most 4 * 255) and
// whose high 64-bit lane is zero. Only ref[0..3] are read; `ref` needs no
// particular alignment.
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
  // Load exactly four bytes. An 8-byte _mm_loadl_epi64 here would read
  // ref[4..7], which a 4-pixel edge does not necessarily provide.
  int32_t packed;
  memcpy(&packed, ref, sizeof(packed));
  const __m128i x = _mm_cvtsi32_si128((int)packed);
  // SAD against zero adds up the bytes of each 64-bit lane; bytes 4..15 of x
  // are zero, so the low lane is ref[0] + ref[1] + ref[2] + ref[3].
  return _mm_sad_epu8(x, _mm_setzero_si128());
}
71 
// Sums the 8 bytes at `ref`.
// Returns a vector whose low 64-bit lane holds the sum (at most 8 * 255); the
// high lane is zero because _mm_loadl_epi64 clears the upper 8 bytes.
static INLINE __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)ref);
  // SAD against zero adds up the bytes of each 64-bit lane.
  return _mm_sad_epu8(pixels, _mm_setzero_si128());
}
77 
// Sums the 64 bytes at `ref`.
// Returns a vector whose low 16-bit word holds the total (at most 64 * 255,
// which fits in 16 bits). `ref` must be 16-byte aligned because the data is
// fetched with the aligned load _mm_load_si128.
static INLINE __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  // Each SAD against zero yields two partial sums (one per 64-bit lane);
  // accumulate them over the four 16-byte chunks.
  for (int offset = 0; offset < 64; offset += 16) {
    const __m128i chunk = _mm_load_si128((const __m128i *)(ref + offset));
    acc = _mm_add_epi16(acc, _mm_sad_epu8(chunk, zero));
  }
  // Fold the high lane's partial sum into the low lane.
  return _mm_add_epi16(acc, _mm_unpackhi_epi64(acc, acc));
}
94 
// Fixed-point reciprocals scaled by 2^DC_SHIFT2 and rounded up:
// 0x5556 ~= 65536 / 3 and 0x3334 ~= 65536 / 5.
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Computes ((num >> shift1) * multiplier) >> DC_SHIFT2, i.e. a division by
// (2^shift1 * 65536 / multiplier) carried out with a shift, a multiply and a
// second shift. All arithmetic is plain `int`.
static INLINE int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  return ((num >> shift1) * multiplier) >> DC_SHIFT2;
}
105 
106 // -----------------------------------------------------------------------------
107 // DC_PRED
108 
aom_dc_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)109 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110                                const uint8_t *above, const uint8_t *left) {
111   const __m128i sum_left = dc_sum_8(left);
112   __m128i sum_above = dc_sum_4(above);
113   sum_above = _mm_add_epi16(sum_left, sum_above);
114 
115   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116   sum += 6;
117   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118 
119   const __m128i row = _mm_set1_epi8((int8_t)sum);
120   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121   dc_store_4xh(pred, 8, dst, stride);
122 }
123 
aom_dc_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)124 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
125                                 const uint8_t *above, const uint8_t *left) {
126   const __m128i sum_left = dc_sum_16_sse2(left);
127   __m128i sum_above = dc_sum_4(above);
128   sum_above = _mm_add_epi16(sum_left, sum_above);
129 
130   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
131   sum += 10;
132   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
133 
134   const __m128i row = _mm_set1_epi8((int8_t)sum);
135   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
136   dc_store_4xh(pred, 16, dst, stride);
137 }
138 
aom_dc_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)139 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
140                                const uint8_t *above, const uint8_t *left) {
141   const __m128i sum_left = dc_sum_4(left);
142   __m128i sum_above = dc_sum_8(above);
143   sum_above = _mm_add_epi16(sum_above, sum_left);
144 
145   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
146   sum += 6;
147   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
148 
149   const __m128i row = _mm_set1_epi8((int8_t)sum);
150   dc_store_8xh(&row, 4, dst, stride);
151 }
152 
aom_dc_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)153 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
154                                 const uint8_t *above, const uint8_t *left) {
155   const __m128i sum_left = dc_sum_16_sse2(left);
156   __m128i sum_above = dc_sum_8(above);
157   sum_above = _mm_add_epi16(sum_above, sum_left);
158 
159   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
160   sum += 12;
161   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
162   const __m128i row = _mm_set1_epi8((int8_t)sum);
163   dc_store_8xh(&row, 16, dst, stride);
164 }
165 
aom_dc_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)166 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
167                                 const uint8_t *above, const uint8_t *left) {
168   const __m128i sum_left = dc_sum_32_sse2(left);
169   __m128i sum_above = dc_sum_8(above);
170   sum_above = _mm_add_epi16(sum_above, sum_left);
171 
172   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
173   sum += 20;
174   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
175   const __m128i row = _mm_set1_epi8((int8_t)sum);
176   dc_store_8xh(&row, 32, dst, stride);
177 }
178 
aom_dc_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)179 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
180                                 const uint8_t *above, const uint8_t *left) {
181   const __m128i sum_left = dc_sum_4(left);
182   __m128i sum_above = dc_sum_16_sse2(above);
183   sum_above = _mm_add_epi16(sum_above, sum_left);
184 
185   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
186   sum += 10;
187   sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
188   const __m128i row = _mm_set1_epi8((int8_t)sum);
189   dc_store_16xh(&row, 4, dst, stride);
190 }
191 
aom_dc_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)192 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
193                                 const uint8_t *above, const uint8_t *left) {
194   const __m128i sum_left = dc_sum_8(left);
195   __m128i sum_above = dc_sum_16_sse2(above);
196   sum_above = _mm_add_epi16(sum_above, sum_left);
197 
198   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
199   sum += 12;
200   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
201   const __m128i row = _mm_set1_epi8((int8_t)sum);
202   dc_store_16xh(&row, 8, dst, stride);
203 }
204 
aom_dc_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)205 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
206                                  const uint8_t *above, const uint8_t *left) {
207   const __m128i sum_left = dc_sum_32_sse2(left);
208   __m128i sum_above = dc_sum_16_sse2(above);
209   sum_above = _mm_add_epi16(sum_left, sum_above);
210 
211   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
212   sum += 24;
213   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
214   const __m128i row = _mm_set1_epi8((int8_t)sum);
215   dc_store_16xh(&row, 32, dst, stride);
216 }
217 
aom_dc_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)218 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
219                                  const uint8_t *above, const uint8_t *left) {
220   const __m128i sum_left = dc_sum_64(left);
221   __m128i sum_above = dc_sum_16_sse2(above);
222   sum_above = _mm_add_epi16(sum_left, sum_above);
223 
224   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
225   sum += 40;
226   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
227   const __m128i row = _mm_set1_epi8((int8_t)sum);
228   dc_store_16xh(&row, 64, dst, stride);
229 }
230 
aom_dc_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)231 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
232                                 const uint8_t *above, const uint8_t *left) {
233   __m128i sum_above = dc_sum_32_sse2(above);
234   const __m128i sum_left = dc_sum_8(left);
235   sum_above = _mm_add_epi16(sum_above, sum_left);
236 
237   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
238   sum += 20;
239   sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
240   const __m128i row = _mm_set1_epi8((int8_t)sum);
241   dc_store_32xh(&row, 8, dst, stride);
242 }
243 
aom_dc_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)244 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
245                                  const uint8_t *above, const uint8_t *left) {
246   __m128i sum_above = dc_sum_32_sse2(above);
247   const __m128i sum_left = dc_sum_16_sse2(left);
248   sum_above = _mm_add_epi16(sum_above, sum_left);
249 
250   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
251   sum += 24;
252   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
253   const __m128i row = _mm_set1_epi8((int8_t)sum);
254   dc_store_32xh(&row, 16, dst, stride);
255 }
256 
aom_dc_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)257 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
258                                  const uint8_t *above, const uint8_t *left) {
259   __m128i sum_above = dc_sum_32_sse2(above);
260   const __m128i sum_left = dc_sum_64(left);
261   sum_above = _mm_add_epi16(sum_above, sum_left);
262 
263   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
264   sum += 48;
265   sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
266   const __m128i row = _mm_set1_epi8((int8_t)sum);
267   dc_store_32xh(&row, 64, dst, stride);
268 }
269 
aom_dc_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)270 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
271                                  const uint8_t *above, const uint8_t *left) {
272   __m128i sum_above = dc_sum_64(above);
273   const __m128i sum_left = dc_sum_64(left);
274   sum_above = _mm_add_epi16(sum_above, sum_left);
275 
276   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
277   sum += 64;
278   sum /= 128;
279   const __m128i row = _mm_set1_epi8((int8_t)sum);
280   dc_store_64xh(&row, 64, dst, stride);
281 }
282 
aom_dc_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)283 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
284                                  const uint8_t *above, const uint8_t *left) {
285   __m128i sum_above = dc_sum_64(above);
286   const __m128i sum_left = dc_sum_32_sse2(left);
287   sum_above = _mm_add_epi16(sum_above, sum_left);
288 
289   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
290   sum += 48;
291   sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
292   const __m128i row = _mm_set1_epi8((int8_t)sum);
293   dc_store_64xh(&row, 32, dst, stride);
294 }
295 
aom_dc_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)296 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
297                                  const uint8_t *above, const uint8_t *left) {
298   __m128i sum_above = dc_sum_64(above);
299   const __m128i sum_left = dc_sum_16_sse2(left);
300   sum_above = _mm_add_epi16(sum_above, sum_left);
301 
302   uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
303   sum += 40;
304   sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
305   const __m128i row = _mm_set1_epi8((int8_t)sum);
306   dc_store_64xh(&row, 16, dst, stride);
307 }
308 
309 // -----------------------------------------------------------------------------
310 // DC_TOP
311 
aom_dc_top_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)312 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
313                                    const uint8_t *above, const uint8_t *left) {
314   (void)left;
315   __m128i sum_above = dc_sum_4(above);
316   const __m128i two = _mm_set1_epi16(2);
317   sum_above = _mm_add_epi16(sum_above, two);
318   sum_above = _mm_srai_epi16(sum_above, 2);
319   sum_above = _mm_shufflelo_epi16(sum_above, 0);
320   sum_above = _mm_packus_epi16(sum_above, sum_above);
321 
322   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
323   dc_store_4xh(pred, 8, dst, stride);
324 }
325 
aom_dc_top_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)326 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
327                                     const uint8_t *above, const uint8_t *left) {
328   (void)left;
329   __m128i sum_above = dc_sum_4(above);
330   const __m128i two = _mm_set1_epi16(2);
331   sum_above = _mm_add_epi16(sum_above, two);
332   sum_above = _mm_srai_epi16(sum_above, 2);
333   sum_above = _mm_shufflelo_epi16(sum_above, 0);
334   sum_above = _mm_packus_epi16(sum_above, sum_above);
335 
336   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
337   dc_store_4xh(pred, 16, dst, stride);
338 }
339 
aom_dc_top_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)340 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
341                                    const uint8_t *above, const uint8_t *left) {
342   (void)left;
343   __m128i sum_above = dc_sum_8(above);
344   const __m128i four = _mm_set1_epi16(4);
345   sum_above = _mm_add_epi16(sum_above, four);
346   sum_above = _mm_srai_epi16(sum_above, 3);
347   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
348   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
349   dc_store_8xh(&row, 4, dst, stride);
350 }
351 
aom_dc_top_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)352 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
353                                     const uint8_t *above, const uint8_t *left) {
354   (void)left;
355   __m128i sum_above = dc_sum_8(above);
356   const __m128i four = _mm_set1_epi16(4);
357   sum_above = _mm_add_epi16(sum_above, four);
358   sum_above = _mm_srai_epi16(sum_above, 3);
359   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
360   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
361   dc_store_8xh(&row, 16, dst, stride);
362 }
363 
aom_dc_top_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)364 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
365                                     const uint8_t *above, const uint8_t *left) {
366   (void)left;
367   __m128i sum_above = dc_sum_8(above);
368   const __m128i four = _mm_set1_epi16(4);
369   sum_above = _mm_add_epi16(sum_above, four);
370   sum_above = _mm_srai_epi16(sum_above, 3);
371   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
372   const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
373   dc_store_8xh(&row, 32, dst, stride);
374 }
375 
aom_dc_top_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)376 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
377                                     const uint8_t *above, const uint8_t *left) {
378   (void)left;
379   __m128i sum_above = dc_sum_16_sse2(above);
380   const __m128i eight = _mm_set1_epi16(8);
381   sum_above = _mm_add_epi16(sum_above, eight);
382   sum_above = _mm_srai_epi16(sum_above, 4);
383   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
384   sum_above = _mm_shufflelo_epi16(sum_above, 0);
385   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
386   dc_store_16xh(&row, 4, dst, stride);
387 }
388 
aom_dc_top_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)389 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
390                                     const uint8_t *above, const uint8_t *left) {
391   (void)left;
392   __m128i sum_above = dc_sum_16_sse2(above);
393   const __m128i eight = _mm_set1_epi16(8);
394   sum_above = _mm_add_epi16(sum_above, eight);
395   sum_above = _mm_srai_epi16(sum_above, 4);
396   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
397   sum_above = _mm_shufflelo_epi16(sum_above, 0);
398   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
399   dc_store_16xh(&row, 8, dst, stride);
400 }
401 
aom_dc_top_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)402 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
403                                      const uint8_t *above,
404                                      const uint8_t *left) {
405   (void)left;
406   __m128i sum_above = dc_sum_16_sse2(above);
407   const __m128i eight = _mm_set1_epi16(8);
408   sum_above = _mm_add_epi16(sum_above, eight);
409   sum_above = _mm_srai_epi16(sum_above, 4);
410   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
411   sum_above = _mm_shufflelo_epi16(sum_above, 0);
412   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
413   dc_store_16xh(&row, 32, dst, stride);
414 }
415 
aom_dc_top_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)416 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
417                                      const uint8_t *above,
418                                      const uint8_t *left) {
419   (void)left;
420   __m128i sum_above = dc_sum_16_sse2(above);
421   const __m128i eight = _mm_set1_epi16(8);
422   sum_above = _mm_add_epi16(sum_above, eight);
423   sum_above = _mm_srai_epi16(sum_above, 4);
424   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
425   sum_above = _mm_shufflelo_epi16(sum_above, 0);
426   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
427   dc_store_16xh(&row, 64, dst, stride);
428 }
429 
aom_dc_top_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)430 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
431                                     const uint8_t *above, const uint8_t *left) {
432   (void)left;
433   __m128i sum_above = dc_sum_32_sse2(above);
434   const __m128i sixteen = _mm_set1_epi16(16);
435   sum_above = _mm_add_epi16(sum_above, sixteen);
436   sum_above = _mm_srai_epi16(sum_above, 5);
437   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438   sum_above = _mm_shufflelo_epi16(sum_above, 0);
439   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440   dc_store_32xh(&row, 8, dst, stride);
441 }
442 
aom_dc_top_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)443 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
444                                      const uint8_t *above,
445                                      const uint8_t *left) {
446   (void)left;
447   __m128i sum_above = dc_sum_32_sse2(above);
448   const __m128i sixteen = _mm_set1_epi16(16);
449   sum_above = _mm_add_epi16(sum_above, sixteen);
450   sum_above = _mm_srai_epi16(sum_above, 5);
451   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
452   sum_above = _mm_shufflelo_epi16(sum_above, 0);
453   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
454   dc_store_32xh(&row, 16, dst, stride);
455 }
456 
aom_dc_top_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
458                                      const uint8_t *above,
459                                      const uint8_t *left) {
460   (void)left;
461   __m128i sum_above = dc_sum_32_sse2(above);
462   const __m128i sixteen = _mm_set1_epi16(16);
463   sum_above = _mm_add_epi16(sum_above, sixteen);
464   sum_above = _mm_srai_epi16(sum_above, 5);
465   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466   sum_above = _mm_shufflelo_epi16(sum_above, 0);
467   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468   dc_store_32xh(&row, 64, dst, stride);
469 }
470 
aom_dc_top_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)471 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
472                                      const uint8_t *above,
473                                      const uint8_t *left) {
474   (void)left;
475   __m128i sum_above = dc_sum_64(above);
476   const __m128i thirtytwo = _mm_set1_epi16(32);
477   sum_above = _mm_add_epi16(sum_above, thirtytwo);
478   sum_above = _mm_srai_epi16(sum_above, 6);
479   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480   sum_above = _mm_shufflelo_epi16(sum_above, 0);
481   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482   dc_store_64xh(&row, 64, dst, stride);
483 }
484 
aom_dc_top_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)485 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
486                                      const uint8_t *above,
487                                      const uint8_t *left) {
488   (void)left;
489   __m128i sum_above = dc_sum_64(above);
490   const __m128i thirtytwo = _mm_set1_epi16(32);
491   sum_above = _mm_add_epi16(sum_above, thirtytwo);
492   sum_above = _mm_srai_epi16(sum_above, 6);
493   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494   sum_above = _mm_shufflelo_epi16(sum_above, 0);
495   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496   dc_store_64xh(&row, 32, dst, stride);
497 }
498 
aom_dc_top_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
500                                      const uint8_t *above,
501                                      const uint8_t *left) {
502   (void)left;
503   __m128i sum_above = dc_sum_64(above);
504   const __m128i thirtytwo = _mm_set1_epi16(32);
505   sum_above = _mm_add_epi16(sum_above, thirtytwo);
506   sum_above = _mm_srai_epi16(sum_above, 6);
507   sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508   sum_above = _mm_shufflelo_epi16(sum_above, 0);
509   const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510   dc_store_64xh(&row, 16, dst, stride);
511 }
512 
513 // -----------------------------------------------------------------------------
514 // DC_LEFT
515 
aom_dc_left_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)516 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
517                                     const uint8_t *above, const uint8_t *left) {
518   (void)above;
519   __m128i sum_left = dc_sum_8(left);
520   const __m128i four = _mm_set1_epi16(4);
521   sum_left = _mm_add_epi16(sum_left, four);
522   sum_left = _mm_srai_epi16(sum_left, 3);
523   sum_left = _mm_shufflelo_epi16(sum_left, 0);
524   sum_left = _mm_packus_epi16(sum_left, sum_left);
525 
526   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
527   dc_store_4xh(pred, 8, dst, stride);
528 }
529 
aom_dc_left_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)530 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
531                                      const uint8_t *above,
532                                      const uint8_t *left) {
533   (void)above;
534   __m128i sum_left = dc_sum_16_sse2(left);
535   const __m128i eight = _mm_set1_epi16(8);
536   sum_left = _mm_add_epi16(sum_left, eight);
537   sum_left = _mm_srai_epi16(sum_left, 4);
538   sum_left = _mm_shufflelo_epi16(sum_left, 0);
539   sum_left = _mm_packus_epi16(sum_left, sum_left);
540 
541   const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
542   dc_store_4xh(pred, 16, dst, stride);
543 }
544 
aom_dc_left_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)545 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
546                                     const uint8_t *above, const uint8_t *left) {
547   (void)above;
548   __m128i sum_left = dc_sum_4(left);
549   const __m128i two = _mm_set1_epi16(2);
550   sum_left = _mm_add_epi16(sum_left, two);
551   sum_left = _mm_srai_epi16(sum_left, 2);
552   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
553   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
554   dc_store_8xh(&row, 4, dst, stride);
555 }
556 
aom_dc_left_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)557 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
558                                      const uint8_t *above,
559                                      const uint8_t *left) {
560   (void)above;
561   __m128i sum_left = dc_sum_16_sse2(left);
562   const __m128i eight = _mm_set1_epi16(8);
563   sum_left = _mm_add_epi16(sum_left, eight);
564   sum_left = _mm_srai_epi16(sum_left, 4);
565   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
566   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
567   dc_store_8xh(&row, 16, dst, stride);
568 }
569 
aom_dc_left_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)570 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
571                                      const uint8_t *above,
572                                      const uint8_t *left) {
573   (void)above;
574   __m128i sum_left = dc_sum_32_sse2(left);
575   const __m128i sixteen = _mm_set1_epi16(16);
576   sum_left = _mm_add_epi16(sum_left, sixteen);
577   sum_left = _mm_srai_epi16(sum_left, 5);
578   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
579   const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
580   dc_store_8xh(&row, 32, dst, stride);
581 }
582 
aom_dc_left_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)583 void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
584                                      const uint8_t *above,
585                                      const uint8_t *left) {
586   (void)above;
587   __m128i sum_left = dc_sum_4(left);
588   const __m128i two = _mm_set1_epi16(2);
589   sum_left = _mm_add_epi16(sum_left, two);
590   sum_left = _mm_srai_epi16(sum_left, 2);
591   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
592   sum_left = _mm_shufflelo_epi16(sum_left, 0);
593   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
594   dc_store_16xh(&row, 4, dst, stride);
595 }
596 
aom_dc_left_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)597 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
598                                      const uint8_t *above,
599                                      const uint8_t *left) {
600   (void)above;
601   __m128i sum_left = dc_sum_8(left);
602   const __m128i four = _mm_set1_epi16(4);
603   sum_left = _mm_add_epi16(sum_left, four);
604   sum_left = _mm_srai_epi16(sum_left, 3);
605   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
606   sum_left = _mm_shufflelo_epi16(sum_left, 0);
607   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
608   dc_store_16xh(&row, 8, dst, stride);
609 }
610 
aom_dc_left_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)611 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
612                                       const uint8_t *above,
613                                       const uint8_t *left) {
614   (void)above;
615   __m128i sum_left = dc_sum_32_sse2(left);
616   const __m128i sixteen = _mm_set1_epi16(16);
617   sum_left = _mm_add_epi16(sum_left, sixteen);
618   sum_left = _mm_srai_epi16(sum_left, 5);
619   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
620   sum_left = _mm_shufflelo_epi16(sum_left, 0);
621   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
622   dc_store_16xh(&row, 32, dst, stride);
623 }
624 
aom_dc_left_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)625 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
626                                       const uint8_t *above,
627                                       const uint8_t *left) {
628   (void)above;
629   __m128i sum_left = dc_sum_64(left);
630   const __m128i thirtytwo = _mm_set1_epi16(32);
631   sum_left = _mm_add_epi16(sum_left, thirtytwo);
632   sum_left = _mm_srai_epi16(sum_left, 6);
633   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
634   sum_left = _mm_shufflelo_epi16(sum_left, 0);
635   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
636   dc_store_16xh(&row, 64, dst, stride);
637 }
638 
aom_dc_left_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)639 void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
640                                      const uint8_t *above,
641                                      const uint8_t *left) {
642   (void)above;
643   __m128i sum_left = dc_sum_8(left);
644   const __m128i four = _mm_set1_epi16(4);
645   sum_left = _mm_add_epi16(sum_left, four);
646   sum_left = _mm_srai_epi16(sum_left, 3);
647   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
648   sum_left = _mm_shufflelo_epi16(sum_left, 0);
649   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
650   dc_store_32xh(&row, 8, dst, stride);
651 }
652 
aom_dc_left_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)653 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
654                                       const uint8_t *above,
655                                       const uint8_t *left) {
656   (void)above;
657   __m128i sum_left = dc_sum_16_sse2(left);
658   const __m128i eight = _mm_set1_epi16(8);
659   sum_left = _mm_add_epi16(sum_left, eight);
660   sum_left = _mm_srai_epi16(sum_left, 4);
661   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
662   sum_left = _mm_shufflelo_epi16(sum_left, 0);
663   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
664   dc_store_32xh(&row, 16, dst, stride);
665 }
666 
aom_dc_left_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)667 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
668                                       const uint8_t *above,
669                                       const uint8_t *left) {
670   (void)above;
671   __m128i sum_left = dc_sum_64(left);
672   const __m128i thirtytwo = _mm_set1_epi16(32);
673   sum_left = _mm_add_epi16(sum_left, thirtytwo);
674   sum_left = _mm_srai_epi16(sum_left, 6);
675   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
676   sum_left = _mm_shufflelo_epi16(sum_left, 0);
677   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
678   dc_store_32xh(&row, 64, dst, stride);
679 }
680 
aom_dc_left_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)681 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
682                                       const uint8_t *above,
683                                       const uint8_t *left) {
684   (void)above;
685   __m128i sum_left = dc_sum_64(left);
686   const __m128i thirtytwo = _mm_set1_epi16(32);
687   sum_left = _mm_add_epi16(sum_left, thirtytwo);
688   sum_left = _mm_srai_epi16(sum_left, 6);
689   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
690   sum_left = _mm_shufflelo_epi16(sum_left, 0);
691   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
692   dc_store_64xh(&row, 64, dst, stride);
693 }
694 
aom_dc_left_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)695 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
696                                       const uint8_t *above,
697                                       const uint8_t *left) {
698   (void)above;
699   __m128i sum_left = dc_sum_32_sse2(left);
700   const __m128i sixteen = _mm_set1_epi16(16);
701   sum_left = _mm_add_epi16(sum_left, sixteen);
702   sum_left = _mm_srai_epi16(sum_left, 5);
703   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
704   sum_left = _mm_shufflelo_epi16(sum_left, 0);
705   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
706   dc_store_64xh(&row, 32, dst, stride);
707 }
708 
aom_dc_left_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)709 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
710                                       const uint8_t *above,
711                                       const uint8_t *left) {
712   (void)above;
713   __m128i sum_left = dc_sum_16_sse2(left);
714   const __m128i eight = _mm_set1_epi16(8);
715   sum_left = _mm_add_epi16(sum_left, eight);
716   sum_left = _mm_srai_epi16(sum_left, 4);
717   sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
718   sum_left = _mm_shufflelo_epi16(sum_left, 0);
719   const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
720   dc_store_64xh(&row, 16, dst, stride);
721 }
722 
723 // -----------------------------------------------------------------------------
724 // DC_128
725 
// DC_128 prediction for a 4x8 block: every output byte is 0x80. Neither
// neighbour array is read.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // Four copies of 0x80 packed into one 32-bit word; one word per row.
  const uint32_t fill_x4 = 0x01010101u * 0x80u;
  dc_store_4xh(fill_x4, 8, dst, stride);
}
733 
// DC_128 prediction for a 4x16 block: every output byte is 0x80. Neither
// neighbour array is read.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // Four copies of 0x80 packed into one 32-bit word; one word per row.
  const uint32_t fill_x4 = 0x01010101u * 0x80u;
  dc_store_4xh(fill_x4, 16, dst, stride);
}
741 
// DC_128 prediction for an 8x4 block: fills 4 rows of 8 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // All 16 byte lanes hold 0x80; dc_store_8xh() writes the low 8 per row.
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_8xh(&fill, 4, dst, stride);
}
749 
// DC_128 prediction for an 8x16 block: fills 16 rows of 8 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // All 16 byte lanes hold 0x80; dc_store_8xh() writes the low 8 per row.
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_8xh(&fill, 16, dst, stride);
}
757 
// DC_128 prediction for an 8x32 block: fills 32 rows of 8 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  // All 16 byte lanes hold 0x80; dc_store_8xh() writes the low 8 per row.
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_8xh(&fill, 32, dst, stride);
}
765 
// DC_128 prediction for a 16x4 block: fills 4 rows of 16 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&fill, 4, dst, stride);
}
773 
// DC_128 prediction for a 16x8 block: fills 8 rows of 16 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&fill, 8, dst, stride);
}
781 
// DC_128 prediction for a 16x32 block: fills 32 rows of 16 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&fill, 32, dst, stride);
}
790 
// DC_128 prediction for a 16x64 block: fills 64 rows of 16 bytes with 0x80.
// Neither neighbour array is read.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_16xh(&fill, 64, dst, stride);
}
799 
// DC_128 prediction for a 32x8 block: passes a vector of sixteen 0x80 bytes
// to dc_store_32xh() with a height of 8. Neither neighbour array is read.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_32xh(&fill, 8, dst, stride);
}
807 
// DC_128 prediction for a 32x16 block: passes a vector of sixteen 0x80 bytes
// to dc_store_32xh() with a height of 16. Neither neighbour array is read.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_32xh(&fill, 16, dst, stride);
}
816 
// DC_128 prediction for a 32x64 block: passes a vector of sixteen 0x80 bytes
// to dc_store_32xh() with a height of 64. Neither neighbour array is read.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_32xh(&fill, 64, dst, stride);
}
825 
// DC_128 prediction for a 64x64 block: passes a vector of sixteen 0x80 bytes
// to dc_store_64xh() with a height of 64. Neither neighbour array is read.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_64xh(&fill, 64, dst, stride);
}
834 
// DC_128 prediction for a 64x32 block: passes a vector of sixteen 0x80 bytes
// to dc_store_64xh() with a height of 32. Neither neighbour array is read.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_64xh(&fill, 32, dst, stride);
}
843 
// DC_128 prediction for a 64x16 block: passes a vector of sixteen 0x80 bytes
// to dc_store_64xh() with a height of 16. Neither neighbour array is read.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)left;
  (void)above;
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);
  dc_store_64xh(&fill, 16, dst, stride);
}
852 
853 // -----------------------------------------------------------------------------
854 // V_PRED
855 
// V_PRED for a 4x8 block: copies the 4 pixels of `above` into each of the 8
// rows of dst. `left` is unused.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Assemble the 4 above pixels byte by byte instead of dereferencing
  // (uint32_t *)above: that cast discarded const and performed a type-punned,
  // possibly misaligned load. x86 is little-endian, so above[0] goes in the
  // low byte to match the word that dc_store_4xh() writes back to memory.
  const uint32_t above_x4 = (uint32_t)above[0] | ((uint32_t)above[1] << 8) |
                            ((uint32_t)above[2] << 16) |
                            ((uint32_t)above[3] << 24);
  dc_store_4xh(above_x4, 8, dst, stride);
}
862 
// V_PRED for a 4x16 block: copies the 4 pixels of `above` into each of the 16
// rows of dst. `left` is unused.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Assemble the 4 above pixels byte by byte instead of dereferencing
  // (uint32_t *)above: that cast discarded const and performed a type-punned,
  // possibly misaligned load. x86 is little-endian, so above[0] goes in the
  // low byte to match the word that dc_store_4xh() writes back to memory.
  const uint32_t above_x4 = (uint32_t)above[0] | ((uint32_t)above[1] << 8) |
                            ((uint32_t)above[2] << 16) |
                            ((uint32_t)above[3] << 24);
  dc_store_4xh(above_x4, 16, dst, stride);
}
869 
// V_PRED for an 8x4 block: replicates the 8 pixels of `above` down 4 rows.
// `left` is unused.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 8-byte load into the low half of the register; upper half is zeroed.
  const __m128i above_lo8 = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&above_lo8, 4, dst, stride);
}
876 
// V_PRED for an 8x16 block: replicates the 8 pixels of `above` down 16 rows.
// `left` is unused.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 8-byte load into the low half of the register; upper half is zeroed.
  const __m128i above_lo8 = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&above_lo8, 16, dst, stride);
}
883 
// V_PRED for an 8x32 block: replicates the 8 pixels of `above` down 32 rows.
// `left` is unused.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // 8-byte load into the low half of the register; upper half is zeroed.
  const __m128i above_lo8 = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&above_lo8, 32, dst, stride);
}
890 
// V_PRED for a 16x4 block: replicates the 16 pixels of `above` down 4 rows.
// `left` is unused. The load is an aligned one, so `above` must be 16-byte
// aligned.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i above_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&above_row, 4, dst, stride);
}
897 
// V_PRED for a 16x8 block: replicates the 16 pixels of `above` down 8 rows.
// `left` is unused. The load is an aligned one, so `above` must be 16-byte
// aligned.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i above_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&above_row, 8, dst, stride);
}
904 
// V_PRED for a 16x32 block: replicates the 16 pixels of `above` down 32 rows.
// `left` is unused. The load is an aligned one, so `above` must be 16-byte
// aligned.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i above_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&above_row, 32, dst, stride);
}
911 
// V_PRED for a 16x64 block: replicates the 16 pixels of `above` down 64 rows.
// `left` is unused. The load is an aligned one, so `above` must be 16-byte
// aligned.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i above_row = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&above_row, 64, dst, stride);
}
918 
v_predictor_32xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int height)919 static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
920                                     const uint8_t *above, int height) {
921   const __m128i row0 = _mm_load_si128((__m128i const *)above);
922   const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
923   for (int i = 0; i < height; ++i) {
924     _mm_store_si128((__m128i *)dst, row0);
925     _mm_store_si128((__m128i *)(dst + 16), row1);
926     dst += stride;
927   }
928 }
929 
// V_PRED for a 32x8 block: forwards to v_predictor_32xh() with 8 rows.
// `left` is unused.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  v_predictor_32xh(dst, stride, above, /*height=*/8);
  (void)left;
}
935 
// V_PRED for a 32x16 block: forwards to v_predictor_32xh() with 16 rows.
// `left` is unused.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_32xh(dst, stride, above, /*height=*/16);
  (void)left;
}
941 
// V_PRED for a 32x64 block: forwards to v_predictor_32xh() with 64 rows.
// `left` is unused.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_32xh(dst, stride, above, /*height=*/64);
  (void)left;
}
947 
v_predictor_64xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int height)948 static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
949                                     const uint8_t *above, int height) {
950   const __m128i row0 = _mm_load_si128((__m128i const *)above);
951   const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
952   const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
953   const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
954   for (int i = 0; i < height; ++i) {
955     _mm_store_si128((__m128i *)dst, row0);
956     _mm_store_si128((__m128i *)(dst + 16), row1);
957     _mm_store_si128((__m128i *)(dst + 32), row2);
958     _mm_store_si128((__m128i *)(dst + 48), row3);
959     dst += stride;
960   }
961 }
962 
// V_PRED for a 64x64 block: forwards to v_predictor_64xh() with 64 rows.
// `left` is unused.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_64xh(dst, stride, above, /*height=*/64);
  (void)left;
}
968 
// V_PRED for a 64x32 block: forwards to v_predictor_64xh() with 32 rows.
// `left` is unused.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_64xh(dst, stride, above, /*height=*/32);
  (void)left;
}
974 
// V_PRED for a 64x16 block: forwards to v_predictor_64xh() with 16 rows.
// `left` is unused.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  v_predictor_64xh(dst, stride, above, /*height=*/16);
  (void)left;
}
980 
981 // -----------------------------------------------------------------------------
982 // H_PRED
983 
// H_PRED for a 4x8 block: row r of dst is filled with four copies of left[r].
// `above` is unused.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // After the byte interleave, 16-bit lane r holds left[r] twice (r = 0..7).
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  const __m128i rows_0_3 = _mm_unpacklo_epi8(left8, left8);
  // Bring lanes 4..7 into the low half, where shufflelo can reach them.
  const __m128i rows_4_7 = _mm_unpackhi_epi64(rows_0_3, rows_0_3);

  // Each shufflelo broadcasts one lane across the low 64 bits; the low 32
  // bits of the result (four copies of one left pixel) form an output row.
  *(int *)(dst + 0 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0x00));
  *(int *)(dst + 1 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0x55));
  *(int *)(dst + 2 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0xaa));
  *(int *)(dst + 3 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_0_3, 0xff));

  *(int *)(dst + 4 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0x00));
  *(int *)(dst + 5 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0x55));
  *(int *)(dst + 6 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0xaa));
  *(int *)(dst + 7 * stride) =
      _mm_cvtsi128_si32(_mm_shufflelo_epi16(rows_4_7, 0xff));
}
1014 
// H_PRED for a 4x16 block: row r of dst is filled with four copies of
// left[r]. `above` is unused. `left` is read with an aligned 16-byte load.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left16 = _mm_load_si128((const __m128i *)left);
  // Interleave each byte with itself: 16-bit lane k of `lo` holds left[k]
  // twice, lane k of `hi` holds left[8 + k] twice.
  const __m128i lo = _mm_unpacklo_epi8(left16, left16);
  const __m128i hi = _mm_unpackhi_epi8(left16, left16);

  // group[g] carries rows 4g..4g+3 in its low four 16-bit lanes.
  __m128i group[4];
  group[0] = lo;
  group[1] = _mm_unpackhi_epi64(lo, lo);
  group[2] = hi;
  group[3] = _mm_unpackhi_epi64(hi, hi);

  for (int g = 0; g < 4; ++g) {
    uint8_t *const out = dst + (ptrdiff_t)(4 * g) * stride;
    const __m128i src = group[g];
    // Broadcast one lane over the low 64 bits, then store its low 32 bits.
    *(int *)(out + 0 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0x00));
    *(int *)(out + 1 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0x55));
    *(int *)(out + 2 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0xaa));
    *(int *)(out + 3 * stride) =
        _mm_cvtsi128_si32(_mm_shufflelo_epi16(src, 0xff));
  }
}
1075 
// H_PRED for an 8x4 block: row r of dst is filled with eight copies of
// left[r]. `above` is unused.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  // An 8x4 block has only 4 left pixels. The former 8-byte _mm_loadl_epi64()
  // read 4 bytes beyond them; gather exactly left[0..3] instead. x86 is
  // little-endian, so left[0] goes in the low byte. The conversion to int is
  // a plain bit reinterpretation on the compilers that provide this header.
  const uint32_t left4 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                         ((uint32_t)left[2] << 16) |
                         ((uint32_t)left[3] << 24);
  const __m128i left_col = _mm_cvtsi32_si128((int)left4);
  // 16-bit lane r now holds left[r] twice (r = 0..3).
  const __m128i rows = _mm_unpacklo_epi8(left_col, left_col);

  // Broadcast one lane over the low 64 bits and store those 8 bytes.
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(rows, 0x00));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(rows, 0x55));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(rows, 0xaa));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(rows, 0xff));
}
1093 
h_predictor_8x16xc(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int count)1094 static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
1095                                       const uint8_t *above, const uint8_t *left,
1096                                       int count) {
1097   (void)above;
1098   for (int i = 0; i < count; ++i) {
1099     const __m128i left_col = _mm_load_si128((__m128i const *)left);
1100     __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
1101     __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
1102 
1103     __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
1104     __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1105     __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1106     __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1107     _mm_storel_epi64((__m128i *)dst, row0);
1108     dst += stride;
1109     _mm_storel_epi64((__m128i *)dst, row1);
1110     dst += stride;
1111     _mm_storel_epi64((__m128i *)dst, row2);
1112     dst += stride;
1113     _mm_storel_epi64((__m128i *)dst, row3);
1114     dst += stride;
1115 
1116     left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
1117     row0 = _mm_shufflelo_epi16(left_col_low, 0);
1118     row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1119     row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1120     row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1121     _mm_storel_epi64((__m128i *)dst, row0);
1122     dst += stride;
1123     _mm_storel_epi64((__m128i *)dst, row1);
1124     dst += stride;
1125     _mm_storel_epi64((__m128i *)dst, row2);
1126     dst += stride;
1127     _mm_storel_epi64((__m128i *)dst, row3);
1128     dst += stride;
1129 
1130     row0 = _mm_shufflelo_epi16(left_col_high, 0);
1131     row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1132     row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1133     row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1134     _mm_storel_epi64((__m128i *)dst, row0);
1135     dst += stride;
1136     _mm_storel_epi64((__m128i *)dst, row1);
1137     dst += stride;
1138     _mm_storel_epi64((__m128i *)dst, row2);
1139     dst += stride;
1140     _mm_storel_epi64((__m128i *)dst, row3);
1141     dst += stride;
1142 
1143     left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
1144     row0 = _mm_shufflelo_epi16(left_col_high, 0);
1145     row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1146     row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1147     row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1148     _mm_storel_epi64((__m128i *)dst, row0);
1149     dst += stride;
1150     _mm_storel_epi64((__m128i *)dst, row1);
1151     dst += stride;
1152     _mm_storel_epi64((__m128i *)dst, row2);
1153     dst += stride;
1154     _mm_storel_epi64((__m128i *)dst, row3);
1155     dst += stride;
1156     left += 16;
1157   }
1158 }
1159 
// H_PRED for an 8x16 block: a single 16-row chunk of h_predictor_8x16xc().
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, /*count=*/1);
}
1164 
// H_PRED for an 8x32 block: two 16-row chunks of h_predictor_8x16xc().
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  h_predictor_8x16xc(dst, stride, above, left, /*count=*/2);
}
1169 
h_pred_store_16xh(const __m128i * row,int h,uint8_t * dst,ptrdiff_t stride)1170 static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
1171                                      ptrdiff_t stride) {
1172   int i;
1173   for (i = 0; i < h; ++i) {
1174     _mm_store_si128((__m128i *)dst, row[i]);
1175     dst += stride;
1176   }
1177 }
1178 
repeat_low_4pixels(const __m128i * x,__m128i * row)1179 static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
1180   const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
1181   const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
1182   const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
1183   const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
1184 
1185   row[0] = _mm_unpacklo_epi64(u0, u0);
1186   row[1] = _mm_unpacklo_epi64(u1, u1);
1187   row[2] = _mm_unpacklo_epi64(u2, u2);
1188   row[3] = _mm_unpacklo_epi64(u3, u3);
1189 }
1190 
repeat_high_4pixels(const __m128i * x,__m128i * row)1191 static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
1192   const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
1193   const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
1194   const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
1195   const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
1196 
1197   row[0] = _mm_unpackhi_epi64(u0, u0);
1198   row[1] = _mm_unpackhi_epi64(u1, u1);
1199   row[2] = _mm_unpackhi_epi64(u2, u2);
1200   row[3] = _mm_unpackhi_epi64(u3, u3);
1201 }
1202 
1203 // Process 16x8, first 4 rows
1204 // Use first 8 bytes of left register: xxxxxxxx33221100
h_prediction_16x8_1(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1205 static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
1206                                        ptrdiff_t stride) {
1207   __m128i row[4];
1208   repeat_low_4pixels(left, row);
1209   h_pred_store_16xh(row, 4, dst, stride);
1210 }
1211 
1212 // Process 16x8, second 4 rows
1213 // Use second 8 bytes of left register: 77665544xxxxxxxx
h_prediction_16x8_2(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1214 static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
1215                                        ptrdiff_t stride) {
1216   __m128i row[4];
1217   repeat_high_4pixels(left, row);
1218   h_pred_store_16xh(row, 4, dst, stride);
1219 }
1220 
// H_PRED for a 16x4 block: row r of dst is filled with sixteen copies of
// left[r]. `above` is unused.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  // A 16x4 block has only 4 left pixels. The former 8-byte _mm_loadl_epi64()
  // read 4 bytes beyond them; gather exactly left[0..3] instead. x86 is
  // little-endian, so left[0] goes in the low byte. The conversion to int is
  // a plain bit reinterpretation on the compilers that provide this header.
  const uint32_t left4 = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                         ((uint32_t)left[2] << 16) |
                         ((uint32_t)left[3] << 24);
  const __m128i left_col = _mm_cvtsi32_si128((int)left4);
  // 16-bit lane r now holds left[r] twice (r = 0..3); only those four low
  // lanes are consumed by h_prediction_16x8_1().
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_16x8_1(&left_col_8p, dst, stride);
}
1228 
// H_PRED for a 16x8 block: row r of dst is filled with sixteen copies of
// left[r]. `above` is unused.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i left8 = _mm_loadl_epi64((const __m128i *)left);
  // 16-bit lane r holds left[r] twice (r = 0..7).
  const __m128i doubled = _mm_unpacklo_epi8(left8, left8);
  // Rows 0..3 come from the low lanes, rows 4..7 from the high lanes.
  h_prediction_16x8_1(&doubled, dst, stride);
  h_prediction_16x8_2(&doubled, dst + 4 * stride, stride);
}
1238 
h_predictor_16xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int count)1239 static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
1240                                     const uint8_t *left, int count) {
1241   int i = 0;
1242   do {
1243     const __m128i left_col = _mm_load_si128((const __m128i *)left);
1244     const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
1245     h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
1246     dst += stride << 2;
1247     h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
1248     dst += stride << 2;
1249 
1250     const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
1251     h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
1252     dst += stride << 2;
1253     h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
1254     dst += stride << 2;
1255 
1256     left += 16;
1257     i++;
1258   } while (i < count);
1259 }
1260 
// H_PRED for a 16x32 block: two 16-row iterations of h_predictor_16xh().
// `above` is unused.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_16xh(dst, stride, left, /*count=*/2);
  (void)above;
}
1266 
// H_PRED for a 16x64 block: four 16-row iterations of h_predictor_16xh().
// `above` is unused.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  h_predictor_16xh(dst, stride, left, /*count=*/4);
  (void)above;
}
1272 
h_pred_store_32xh(const __m128i * row,int h,uint8_t * dst,ptrdiff_t stride)1273 static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
1274                                      ptrdiff_t stride) {
1275   int i;
1276   for (i = 0; i < h; ++i) {
1277     _mm_store_si128((__m128i *)dst, row[i]);
1278     _mm_store_si128((__m128i *)(dst + 16), row[i]);
1279     dst += stride;
1280   }
1281 }
1282 
1283 // Process 32x8, first 4 rows
1284 // Use first 8 bytes of left register: xxxxxxxx33221100
h_prediction_32x8_1(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1285 static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
1286                                        ptrdiff_t stride) {
1287   __m128i row[4];
1288   repeat_low_4pixels(left, row);
1289   h_pred_store_32xh(row, 4, dst, stride);
1290 }
1291 
1292 // Process 32x8, second 4 rows
1293 // Use second 8 bytes of left register: 77665544xxxxxxxx
h_prediction_32x8_2(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1294 static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
1295                                        ptrdiff_t stride) {
1296   __m128i row[4];
1297   repeat_high_4pixels(left, row);
1298   h_pred_store_32xh(row, 4, dst, stride);
1299 }
1300 
// Horizontal intra predictor for a 32-wide, 8-tall block: each of the 8
// output rows is filled with the matching pixel of the left column.
//
// dst    - destination block; written with aligned 16-byte stores.
// stride - distance in bytes between consecutive rows of dst.
// above  - unused (kept for the common predictor signature).
// left   - left-neighbour column; exactly 8 bytes are read.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;

  // An 8-row block has only 8 left pixels. Load just those 8 bytes instead
  // of a full aligned 16-byte vector: this avoids reading past the column
  // and drops the 16-byte alignment requirement on `left`. The upper half
  // of the register is never used below.
  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);

  // Duplicate each pixel: the register now holds 7766554433221100.
  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
  h_prediction_32x8_1(&left_col_8p, dst, stride);  // rows 0..3
  dst += stride << 2;
  h_prediction_32x8_2(&left_col_8p, dst, stride);  // rows 4..7
}
1313 
// Horizontal intra predictor for a 32-wide, 16-tall block.
// `above` is unused. `left` is read with an aligned 16-byte load, so it must
// be 16-byte aligned and provide 16 pixels.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;

  const __m128i left_col = _mm_load_si128((const __m128i *)left);
  // Each left pixel duplicated into a byte pair: low half covers rows 0..7,
  // high half covers rows 8..15.
  const __m128i rows_0_7 = _mm_unpacklo_epi8(left_col, left_col);
  const __m128i rows_8_15 = _mm_unpackhi_epi8(left_col, left_col);
  const ptrdiff_t four_rows = stride * 4;

  h_prediction_32x8_1(&rows_0_7, dst, stride);
  h_prediction_32x8_2(&rows_0_7, dst + four_rows, stride);
  h_prediction_32x8_1(&rows_8_15, dst + 2 * four_rows, stride);
  h_prediction_32x8_2(&rows_8_15, dst + 3 * four_rows, stride);
}
1332 
// Horizontal prediction for 32-pixel-wide blocks: every output row is filled
// with the corresponding pixel of the left column.
//
// dst    - top-left of the destination. Rows are written with aligned
//          16-byte stores, so dst and stride must keep each row start
//          16-byte aligned.
// stride - distance in bytes between consecutive rows of dst.
// left   - left-neighbour column, one byte per row; no alignment required.
// height - number of rows. Rows are produced four at a time, so only
//          (height / 4) * 4 rows are written; height < 4 writes nothing.
static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  for (int groups = height >> 2; groups > 0; --groups) {
    // Gather four left pixels into one 32-bit lane byte by byte (little
    // endian, matching x86). This avoids reading the uint8_t array through
    // an int pointer, which would cast away const and break the aliasing
    // and alignment rules.
    const uint32_t packed = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                            ((uint32_t)left[2] << 16) |
                            ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)packed);
    // Two byte-unpacks replicate each pixel four times, so 32-bit lane k
    // holds pixel k in all of its bytes.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);

    // Broadcast lane k across the register to form row k.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);

    left += 4;
    dst += stride * 4;
  }
}
1356 
// Horizontal intra predictor for a 32-wide, 64-tall block.
// `above` is part of the common predictor signature and is not used.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)above;
  h_predictor_32xh(dst, stride, left, block_height);
}
1362 
// Horizontal prediction for 64-pixel-wide blocks: every output row is filled
// with the corresponding pixel of the left column.
//
// dst    - top-left of the destination. Rows are written with aligned
//          16-byte stores, so dst and stride must keep each row start
//          16-byte aligned.
// stride - distance in bytes between consecutive rows of dst.
// left   - left-neighbour column, one byte per row; no alignment required.
// height - number of rows. Rows are produced four at a time, so only
//          (height / 4) * 4 rows are written; height < 4 writes nothing.
static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int height) {
  for (int groups = height >> 2; groups > 0; --groups) {
    // Gather four left pixels into one 32-bit lane byte by byte (little
    // endian, matching x86). This avoids reading the uint8_t array through
    // an int pointer, which would cast away const and break the aliasing
    // and alignment rules.
    const uint32_t packed = (uint32_t)left[0] | ((uint32_t)left[1] << 8) |
                            ((uint32_t)left[2] << 16) |
                            ((uint32_t)left[3] << 24);
    __m128i left4 = _mm_cvtsi32_si128((int)packed);
    // Two byte-unpacks replicate each pixel four times, so 32-bit lane k
    // holds pixel k in all of its bytes.
    left4 = _mm_unpacklo_epi8(left4, left4);
    left4 = _mm_unpacklo_epi8(left4, left4);

    // Broadcast lane k across the register to form row k, then write it
    // four times to cover the 64-byte row.
    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r0);
    _mm_store_si128((__m128i *)(dst + 32), r0);
    _mm_store_si128((__m128i *)(dst + 48), r0);
    _mm_store_si128((__m128i *)(dst + stride), r1);
    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);

    left += 4;
    dst += stride * 4;
  }
}
1394 
// Horizontal intra predictor for a 64-wide, 64-tall block.
// `above` is part of the common predictor signature and is not used.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 64;
  (void)above;
  h_predictor_64xh(dst, stride, left, block_height);
}
1400 
// Horizontal intra predictor for a 64-wide, 32-tall block.
// `above` is part of the common predictor signature and is not used.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 32;
  (void)above;
  h_predictor_64xh(dst, stride, left, block_height);
}
1406 
// Horizontal intra predictor for a 64-wide, 16-tall block.
// `above` is part of the common predictor signature and is not used.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int block_height = 16;
  (void)above;
  h_predictor_64xh(dst, stride, left, block_height);
}
1412