1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
#include <emmintrin.h>
#include <string.h>

#include "aom_dsp/x86/intrapred_x86.h"
#include "config/aom_dsp_rtcd.h"
15
// Writes the 4 bytes of `dc` to the start of each of `height` consecutive
// rows of `dst`, where rows are `stride` bytes apart.
//
// The store goes through memcpy rather than a uint32_t pointer cast: `dst` is
// a byte buffer with no 4-byte alignment guarantee, and writing it through a
// uint32_t lvalue violates strict aliasing. Exactly `height` rows are written.
static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    memcpy(dst, &dc, sizeof(dc));
    dst += stride;
  }
}
25
// Stores the low 8 bytes of `*row` at the start of each of `height` rows of
// `dst`; consecutive rows are `stride` bytes apart.
static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
                                ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int remaining = height; remaining > 0; --remaining) {
    _mm_storel_epi64((__m128i *)out, *row);
    out += stride;
  }
}
34
// Stores all 16 bytes of `*row` at the start of each of `height` rows of
// `dst`; consecutive rows are `stride` bytes apart.
// _mm_store_si128 is an aligned store, so every row start must be 16-byte
// aligned.
static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int remaining = height; remaining > 0; --remaining) {
    _mm_store_si128((__m128i *)out, *row);
    out += stride;
  }
}
43
// Fills the first 32 bytes of each of `height` rows of `dst` with two copies
// of `*row`; consecutive rows are `stride` bytes apart.
// Uses aligned stores, so every row start must be 16-byte aligned.
static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int remaining = height; remaining > 0; --remaining) {
    __m128i *const lanes = (__m128i *)out;
    _mm_store_si128(lanes, *row);
    _mm_store_si128(lanes + 1, *row);
    out += stride;
  }
}
53
// Fills the first 64 bytes of each of `height` rows of `dst` with four copies
// of `*row`; consecutive rows are `stride` bytes apart.
// Uses aligned stores, so every row start must be 16-byte aligned.
static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
                                 ptrdiff_t stride) {
  uint8_t *out = dst;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < 64; x += 16) {
      _mm_store_si128((__m128i *)(out + x), *row);
    }
    out += stride;
  }
}
64
// Returns the sum of ref[0..3] in the low 16 bits of the result; all other
// bits are zero.
//
// Only 4 bytes are read (via memcpy, so `ref` needs no alignment). An 8-byte
// load would touch memory past the 4-byte edge this helper is asked to sum.
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
  int32_t packed;
  memcpy(&packed, ref, sizeof(packed));
  // _mm_sad_epu8 against zero adds up the bytes of each 64-bit half.
  return _mm_sad_epu8(_mm_cvtsi32_si128(packed), _mm_setzero_si128());
}
71
// Returns the sum of ref[0..7] in the low 16 bits of the result.
// The 8 bytes are loaded unaligned into the low half of the register and
// summed by _mm_sad_epu8 against zero.
static INLINE __m128i dc_sum_8(const uint8_t *ref) {
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)ref);
  return _mm_sad_epu8(pixels, _mm_setzero_si128());
}
77
// Returns the sum of ref[0..63] in the low 16 bits of the result.
// `ref` is read with aligned loads and must be 16-byte aligned.
static INLINE __m128i dc_sum_64(const uint8_t *ref) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i *const chunks = (const __m128i *)ref;

  // Each SAD yields two partial sums, one per 64-bit half of a 16-byte chunk.
  __m128i acc = _mm_sad_epu8(_mm_load_si128(chunks), zero);
  for (int i = 1; i < 4; ++i) {
    acc = _mm_add_epi16(acc, _mm_sad_epu8(_mm_load_si128(chunks + i), zero));
  }

  // Fold the upper half's partial sum onto the lower half.
  return _mm_add_epi16(acc, _mm_unpackhi_epi64(acc, acc));
}
94
// Fixed-point reciprocals with DC_SHIFT2 (16) fractional bits:
// 0x5556 / 65536 is roughly 1/3 and 0x3334 / 65536 is roughly 1/5.
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Computes ((num >> shift1) * multiplier) >> DC_SHIFT2, i.e. a division by
// (1 << shift1) followed by a multiplication with a Q16 reciprocal.
static INLINE int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier) {
  const int scaled = (num >> shift1) * multiplier;
  return scaled >> DC_SHIFT2;
}
105
106 // -----------------------------------------------------------------------------
107 // DC_PRED
108
aom_dc_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)109 void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
110 const uint8_t *above, const uint8_t *left) {
111 const __m128i sum_left = dc_sum_8(left);
112 __m128i sum_above = dc_sum_4(above);
113 sum_above = _mm_add_epi16(sum_left, sum_above);
114
115 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
116 sum += 6;
117 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
118
119 const __m128i row = _mm_set1_epi8((int8_t)sum);
120 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
121 dc_store_4xh(pred, 8, dst, stride);
122 }
123
aom_dc_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)124 void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
125 const uint8_t *above, const uint8_t *left) {
126 const __m128i sum_left = dc_sum_16_sse2(left);
127 __m128i sum_above = dc_sum_4(above);
128 sum_above = _mm_add_epi16(sum_left, sum_above);
129
130 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
131 sum += 10;
132 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
133
134 const __m128i row = _mm_set1_epi8((int8_t)sum);
135 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
136 dc_store_4xh(pred, 16, dst, stride);
137 }
138
aom_dc_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)139 void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
140 const uint8_t *above, const uint8_t *left) {
141 const __m128i sum_left = dc_sum_4(left);
142 __m128i sum_above = dc_sum_8(above);
143 sum_above = _mm_add_epi16(sum_above, sum_left);
144
145 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
146 sum += 6;
147 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
148
149 const __m128i row = _mm_set1_epi8((int8_t)sum);
150 dc_store_8xh(&row, 4, dst, stride);
151 }
152
aom_dc_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)153 void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
154 const uint8_t *above, const uint8_t *left) {
155 const __m128i sum_left = dc_sum_16_sse2(left);
156 __m128i sum_above = dc_sum_8(above);
157 sum_above = _mm_add_epi16(sum_above, sum_left);
158
159 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
160 sum += 12;
161 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
162 const __m128i row = _mm_set1_epi8((int8_t)sum);
163 dc_store_8xh(&row, 16, dst, stride);
164 }
165
aom_dc_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)166 void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
167 const uint8_t *above, const uint8_t *left) {
168 const __m128i sum_left = dc_sum_32_sse2(left);
169 __m128i sum_above = dc_sum_8(above);
170 sum_above = _mm_add_epi16(sum_above, sum_left);
171
172 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
173 sum += 20;
174 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
175 const __m128i row = _mm_set1_epi8((int8_t)sum);
176 dc_store_8xh(&row, 32, dst, stride);
177 }
178
aom_dc_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)179 void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
180 const uint8_t *above, const uint8_t *left) {
181 const __m128i sum_left = dc_sum_4(left);
182 __m128i sum_above = dc_sum_16_sse2(above);
183 sum_above = _mm_add_epi16(sum_above, sum_left);
184
185 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
186 sum += 10;
187 sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
188 const __m128i row = _mm_set1_epi8((int8_t)sum);
189 dc_store_16xh(&row, 4, dst, stride);
190 }
191
aom_dc_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)192 void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
193 const uint8_t *above, const uint8_t *left) {
194 const __m128i sum_left = dc_sum_8(left);
195 __m128i sum_above = dc_sum_16_sse2(above);
196 sum_above = _mm_add_epi16(sum_above, sum_left);
197
198 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
199 sum += 12;
200 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
201 const __m128i row = _mm_set1_epi8((int8_t)sum);
202 dc_store_16xh(&row, 8, dst, stride);
203 }
204
aom_dc_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)205 void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
206 const uint8_t *above, const uint8_t *left) {
207 const __m128i sum_left = dc_sum_32_sse2(left);
208 __m128i sum_above = dc_sum_16_sse2(above);
209 sum_above = _mm_add_epi16(sum_left, sum_above);
210
211 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
212 sum += 24;
213 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
214 const __m128i row = _mm_set1_epi8((int8_t)sum);
215 dc_store_16xh(&row, 32, dst, stride);
216 }
217
aom_dc_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)218 void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
219 const uint8_t *above, const uint8_t *left) {
220 const __m128i sum_left = dc_sum_64(left);
221 __m128i sum_above = dc_sum_16_sse2(above);
222 sum_above = _mm_add_epi16(sum_left, sum_above);
223
224 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
225 sum += 40;
226 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
227 const __m128i row = _mm_set1_epi8((int8_t)sum);
228 dc_store_16xh(&row, 64, dst, stride);
229 }
230
aom_dc_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)231 void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
232 const uint8_t *above, const uint8_t *left) {
233 __m128i sum_above = dc_sum_32_sse2(above);
234 const __m128i sum_left = dc_sum_8(left);
235 sum_above = _mm_add_epi16(sum_above, sum_left);
236
237 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
238 sum += 20;
239 sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
240 const __m128i row = _mm_set1_epi8((int8_t)sum);
241 dc_store_32xh(&row, 8, dst, stride);
242 }
243
aom_dc_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)244 void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
245 const uint8_t *above, const uint8_t *left) {
246 __m128i sum_above = dc_sum_32_sse2(above);
247 const __m128i sum_left = dc_sum_16_sse2(left);
248 sum_above = _mm_add_epi16(sum_above, sum_left);
249
250 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
251 sum += 24;
252 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
253 const __m128i row = _mm_set1_epi8((int8_t)sum);
254 dc_store_32xh(&row, 16, dst, stride);
255 }
256
aom_dc_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)257 void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
258 const uint8_t *above, const uint8_t *left) {
259 __m128i sum_above = dc_sum_32_sse2(above);
260 const __m128i sum_left = dc_sum_64(left);
261 sum_above = _mm_add_epi16(sum_above, sum_left);
262
263 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
264 sum += 48;
265 sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
266 const __m128i row = _mm_set1_epi8((int8_t)sum);
267 dc_store_32xh(&row, 64, dst, stride);
268 }
269
aom_dc_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)270 void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
271 const uint8_t *above, const uint8_t *left) {
272 __m128i sum_above = dc_sum_64(above);
273 const __m128i sum_left = dc_sum_64(left);
274 sum_above = _mm_add_epi16(sum_above, sum_left);
275
276 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
277 sum += 64;
278 sum /= 128;
279 const __m128i row = _mm_set1_epi8((int8_t)sum);
280 dc_store_64xh(&row, 64, dst, stride);
281 }
282
aom_dc_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)283 void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
284 const uint8_t *above, const uint8_t *left) {
285 __m128i sum_above = dc_sum_64(above);
286 const __m128i sum_left = dc_sum_32_sse2(left);
287 sum_above = _mm_add_epi16(sum_above, sum_left);
288
289 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
290 sum += 48;
291 sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
292 const __m128i row = _mm_set1_epi8((int8_t)sum);
293 dc_store_64xh(&row, 32, dst, stride);
294 }
295
aom_dc_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)296 void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
297 const uint8_t *above, const uint8_t *left) {
298 __m128i sum_above = dc_sum_64(above);
299 const __m128i sum_left = dc_sum_16_sse2(left);
300 sum_above = _mm_add_epi16(sum_above, sum_left);
301
302 uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
303 sum += 40;
304 sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
305 const __m128i row = _mm_set1_epi8((int8_t)sum);
306 dc_store_64xh(&row, 16, dst, stride);
307 }
308
309 // -----------------------------------------------------------------------------
310 // DC_TOP
311
aom_dc_top_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)312 void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
313 const uint8_t *above, const uint8_t *left) {
314 (void)left;
315 __m128i sum_above = dc_sum_4(above);
316 const __m128i two = _mm_set1_epi16(2);
317 sum_above = _mm_add_epi16(sum_above, two);
318 sum_above = _mm_srai_epi16(sum_above, 2);
319 sum_above = _mm_shufflelo_epi16(sum_above, 0);
320 sum_above = _mm_packus_epi16(sum_above, sum_above);
321
322 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
323 dc_store_4xh(pred, 8, dst, stride);
324 }
325
aom_dc_top_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)326 void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
327 const uint8_t *above, const uint8_t *left) {
328 (void)left;
329 __m128i sum_above = dc_sum_4(above);
330 const __m128i two = _mm_set1_epi16(2);
331 sum_above = _mm_add_epi16(sum_above, two);
332 sum_above = _mm_srai_epi16(sum_above, 2);
333 sum_above = _mm_shufflelo_epi16(sum_above, 0);
334 sum_above = _mm_packus_epi16(sum_above, sum_above);
335
336 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
337 dc_store_4xh(pred, 16, dst, stride);
338 }
339
aom_dc_top_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)340 void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
341 const uint8_t *above, const uint8_t *left) {
342 (void)left;
343 __m128i sum_above = dc_sum_8(above);
344 const __m128i four = _mm_set1_epi16(4);
345 sum_above = _mm_add_epi16(sum_above, four);
346 sum_above = _mm_srai_epi16(sum_above, 3);
347 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
348 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
349 dc_store_8xh(&row, 4, dst, stride);
350 }
351
aom_dc_top_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)352 void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
353 const uint8_t *above, const uint8_t *left) {
354 (void)left;
355 __m128i sum_above = dc_sum_8(above);
356 const __m128i four = _mm_set1_epi16(4);
357 sum_above = _mm_add_epi16(sum_above, four);
358 sum_above = _mm_srai_epi16(sum_above, 3);
359 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
360 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
361 dc_store_8xh(&row, 16, dst, stride);
362 }
363
aom_dc_top_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)364 void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
365 const uint8_t *above, const uint8_t *left) {
366 (void)left;
367 __m128i sum_above = dc_sum_8(above);
368 const __m128i four = _mm_set1_epi16(4);
369 sum_above = _mm_add_epi16(sum_above, four);
370 sum_above = _mm_srai_epi16(sum_above, 3);
371 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
372 const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
373 dc_store_8xh(&row, 32, dst, stride);
374 }
375
aom_dc_top_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)376 void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
377 const uint8_t *above, const uint8_t *left) {
378 (void)left;
379 __m128i sum_above = dc_sum_16_sse2(above);
380 const __m128i eight = _mm_set1_epi16(8);
381 sum_above = _mm_add_epi16(sum_above, eight);
382 sum_above = _mm_srai_epi16(sum_above, 4);
383 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
384 sum_above = _mm_shufflelo_epi16(sum_above, 0);
385 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
386 dc_store_16xh(&row, 4, dst, stride);
387 }
388
aom_dc_top_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)389 void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
390 const uint8_t *above, const uint8_t *left) {
391 (void)left;
392 __m128i sum_above = dc_sum_16_sse2(above);
393 const __m128i eight = _mm_set1_epi16(8);
394 sum_above = _mm_add_epi16(sum_above, eight);
395 sum_above = _mm_srai_epi16(sum_above, 4);
396 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
397 sum_above = _mm_shufflelo_epi16(sum_above, 0);
398 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
399 dc_store_16xh(&row, 8, dst, stride);
400 }
401
aom_dc_top_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)402 void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
403 const uint8_t *above,
404 const uint8_t *left) {
405 (void)left;
406 __m128i sum_above = dc_sum_16_sse2(above);
407 const __m128i eight = _mm_set1_epi16(8);
408 sum_above = _mm_add_epi16(sum_above, eight);
409 sum_above = _mm_srai_epi16(sum_above, 4);
410 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
411 sum_above = _mm_shufflelo_epi16(sum_above, 0);
412 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
413 dc_store_16xh(&row, 32, dst, stride);
414 }
415
aom_dc_top_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)416 void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
417 const uint8_t *above,
418 const uint8_t *left) {
419 (void)left;
420 __m128i sum_above = dc_sum_16_sse2(above);
421 const __m128i eight = _mm_set1_epi16(8);
422 sum_above = _mm_add_epi16(sum_above, eight);
423 sum_above = _mm_srai_epi16(sum_above, 4);
424 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
425 sum_above = _mm_shufflelo_epi16(sum_above, 0);
426 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
427 dc_store_16xh(&row, 64, dst, stride);
428 }
429
aom_dc_top_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)430 void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
431 const uint8_t *above, const uint8_t *left) {
432 (void)left;
433 __m128i sum_above = dc_sum_32_sse2(above);
434 const __m128i sixteen = _mm_set1_epi16(16);
435 sum_above = _mm_add_epi16(sum_above, sixteen);
436 sum_above = _mm_srai_epi16(sum_above, 5);
437 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
438 sum_above = _mm_shufflelo_epi16(sum_above, 0);
439 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
440 dc_store_32xh(&row, 8, dst, stride);
441 }
442
aom_dc_top_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)443 void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
444 const uint8_t *above,
445 const uint8_t *left) {
446 (void)left;
447 __m128i sum_above = dc_sum_32_sse2(above);
448 const __m128i sixteen = _mm_set1_epi16(16);
449 sum_above = _mm_add_epi16(sum_above, sixteen);
450 sum_above = _mm_srai_epi16(sum_above, 5);
451 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
452 sum_above = _mm_shufflelo_epi16(sum_above, 0);
453 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
454 dc_store_32xh(&row, 16, dst, stride);
455 }
456
aom_dc_top_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)457 void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
458 const uint8_t *above,
459 const uint8_t *left) {
460 (void)left;
461 __m128i sum_above = dc_sum_32_sse2(above);
462 const __m128i sixteen = _mm_set1_epi16(16);
463 sum_above = _mm_add_epi16(sum_above, sixteen);
464 sum_above = _mm_srai_epi16(sum_above, 5);
465 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
466 sum_above = _mm_shufflelo_epi16(sum_above, 0);
467 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
468 dc_store_32xh(&row, 64, dst, stride);
469 }
470
aom_dc_top_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)471 void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
472 const uint8_t *above,
473 const uint8_t *left) {
474 (void)left;
475 __m128i sum_above = dc_sum_64(above);
476 const __m128i thirtytwo = _mm_set1_epi16(32);
477 sum_above = _mm_add_epi16(sum_above, thirtytwo);
478 sum_above = _mm_srai_epi16(sum_above, 6);
479 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
480 sum_above = _mm_shufflelo_epi16(sum_above, 0);
481 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
482 dc_store_64xh(&row, 64, dst, stride);
483 }
484
aom_dc_top_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)485 void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
486 const uint8_t *above,
487 const uint8_t *left) {
488 (void)left;
489 __m128i sum_above = dc_sum_64(above);
490 const __m128i thirtytwo = _mm_set1_epi16(32);
491 sum_above = _mm_add_epi16(sum_above, thirtytwo);
492 sum_above = _mm_srai_epi16(sum_above, 6);
493 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
494 sum_above = _mm_shufflelo_epi16(sum_above, 0);
495 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
496 dc_store_64xh(&row, 32, dst, stride);
497 }
498
aom_dc_top_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)499 void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
500 const uint8_t *above,
501 const uint8_t *left) {
502 (void)left;
503 __m128i sum_above = dc_sum_64(above);
504 const __m128i thirtytwo = _mm_set1_epi16(32);
505 sum_above = _mm_add_epi16(sum_above, thirtytwo);
506 sum_above = _mm_srai_epi16(sum_above, 6);
507 sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
508 sum_above = _mm_shufflelo_epi16(sum_above, 0);
509 const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
510 dc_store_64xh(&row, 16, dst, stride);
511 }
512
513 // -----------------------------------------------------------------------------
514 // DC_LEFT
515
aom_dc_left_predictor_4x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)516 void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
517 const uint8_t *above, const uint8_t *left) {
518 (void)above;
519 __m128i sum_left = dc_sum_8(left);
520 const __m128i four = _mm_set1_epi16(4);
521 sum_left = _mm_add_epi16(sum_left, four);
522 sum_left = _mm_srai_epi16(sum_left, 3);
523 sum_left = _mm_shufflelo_epi16(sum_left, 0);
524 sum_left = _mm_packus_epi16(sum_left, sum_left);
525
526 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
527 dc_store_4xh(pred, 8, dst, stride);
528 }
529
aom_dc_left_predictor_4x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)530 void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
531 const uint8_t *above,
532 const uint8_t *left) {
533 (void)above;
534 __m128i sum_left = dc_sum_16_sse2(left);
535 const __m128i eight = _mm_set1_epi16(8);
536 sum_left = _mm_add_epi16(sum_left, eight);
537 sum_left = _mm_srai_epi16(sum_left, 4);
538 sum_left = _mm_shufflelo_epi16(sum_left, 0);
539 sum_left = _mm_packus_epi16(sum_left, sum_left);
540
541 const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
542 dc_store_4xh(pred, 16, dst, stride);
543 }
544
aom_dc_left_predictor_8x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)545 void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
546 const uint8_t *above, const uint8_t *left) {
547 (void)above;
548 __m128i sum_left = dc_sum_4(left);
549 const __m128i two = _mm_set1_epi16(2);
550 sum_left = _mm_add_epi16(sum_left, two);
551 sum_left = _mm_srai_epi16(sum_left, 2);
552 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
553 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
554 dc_store_8xh(&row, 4, dst, stride);
555 }
556
aom_dc_left_predictor_8x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)557 void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
558 const uint8_t *above,
559 const uint8_t *left) {
560 (void)above;
561 __m128i sum_left = dc_sum_16_sse2(left);
562 const __m128i eight = _mm_set1_epi16(8);
563 sum_left = _mm_add_epi16(sum_left, eight);
564 sum_left = _mm_srai_epi16(sum_left, 4);
565 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
566 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
567 dc_store_8xh(&row, 16, dst, stride);
568 }
569
aom_dc_left_predictor_8x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)570 void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
571 const uint8_t *above,
572 const uint8_t *left) {
573 (void)above;
574 __m128i sum_left = dc_sum_32_sse2(left);
575 const __m128i sixteen = _mm_set1_epi16(16);
576 sum_left = _mm_add_epi16(sum_left, sixteen);
577 sum_left = _mm_srai_epi16(sum_left, 5);
578 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
579 const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
580 dc_store_8xh(&row, 32, dst, stride);
581 }
582
aom_dc_left_predictor_16x4_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)583 void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
584 const uint8_t *above,
585 const uint8_t *left) {
586 (void)above;
587 __m128i sum_left = dc_sum_4(left);
588 const __m128i two = _mm_set1_epi16(2);
589 sum_left = _mm_add_epi16(sum_left, two);
590 sum_left = _mm_srai_epi16(sum_left, 2);
591 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
592 sum_left = _mm_shufflelo_epi16(sum_left, 0);
593 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
594 dc_store_16xh(&row, 4, dst, stride);
595 }
596
aom_dc_left_predictor_16x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)597 void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
598 const uint8_t *above,
599 const uint8_t *left) {
600 (void)above;
601 __m128i sum_left = dc_sum_8(left);
602 const __m128i four = _mm_set1_epi16(4);
603 sum_left = _mm_add_epi16(sum_left, four);
604 sum_left = _mm_srai_epi16(sum_left, 3);
605 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
606 sum_left = _mm_shufflelo_epi16(sum_left, 0);
607 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
608 dc_store_16xh(&row, 8, dst, stride);
609 }
610
aom_dc_left_predictor_16x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)611 void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
612 const uint8_t *above,
613 const uint8_t *left) {
614 (void)above;
615 __m128i sum_left = dc_sum_32_sse2(left);
616 const __m128i sixteen = _mm_set1_epi16(16);
617 sum_left = _mm_add_epi16(sum_left, sixteen);
618 sum_left = _mm_srai_epi16(sum_left, 5);
619 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
620 sum_left = _mm_shufflelo_epi16(sum_left, 0);
621 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
622 dc_store_16xh(&row, 32, dst, stride);
623 }
624
aom_dc_left_predictor_16x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)625 void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
626 const uint8_t *above,
627 const uint8_t *left) {
628 (void)above;
629 __m128i sum_left = dc_sum_64(left);
630 const __m128i thirtytwo = _mm_set1_epi16(32);
631 sum_left = _mm_add_epi16(sum_left, thirtytwo);
632 sum_left = _mm_srai_epi16(sum_left, 6);
633 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
634 sum_left = _mm_shufflelo_epi16(sum_left, 0);
635 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
636 dc_store_16xh(&row, 64, dst, stride);
637 }
638
aom_dc_left_predictor_32x8_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)639 void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
640 const uint8_t *above,
641 const uint8_t *left) {
642 (void)above;
643 __m128i sum_left = dc_sum_8(left);
644 const __m128i four = _mm_set1_epi16(4);
645 sum_left = _mm_add_epi16(sum_left, four);
646 sum_left = _mm_srai_epi16(sum_left, 3);
647 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
648 sum_left = _mm_shufflelo_epi16(sum_left, 0);
649 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
650 dc_store_32xh(&row, 8, dst, stride);
651 }
652
aom_dc_left_predictor_32x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)653 void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
654 const uint8_t *above,
655 const uint8_t *left) {
656 (void)above;
657 __m128i sum_left = dc_sum_16_sse2(left);
658 const __m128i eight = _mm_set1_epi16(8);
659 sum_left = _mm_add_epi16(sum_left, eight);
660 sum_left = _mm_srai_epi16(sum_left, 4);
661 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
662 sum_left = _mm_shufflelo_epi16(sum_left, 0);
663 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
664 dc_store_32xh(&row, 16, dst, stride);
665 }
666
aom_dc_left_predictor_32x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)667 void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
668 const uint8_t *above,
669 const uint8_t *left) {
670 (void)above;
671 __m128i sum_left = dc_sum_64(left);
672 const __m128i thirtytwo = _mm_set1_epi16(32);
673 sum_left = _mm_add_epi16(sum_left, thirtytwo);
674 sum_left = _mm_srai_epi16(sum_left, 6);
675 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
676 sum_left = _mm_shufflelo_epi16(sum_left, 0);
677 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
678 dc_store_32xh(&row, 64, dst, stride);
679 }
680
aom_dc_left_predictor_64x64_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)681 void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
682 const uint8_t *above,
683 const uint8_t *left) {
684 (void)above;
685 __m128i sum_left = dc_sum_64(left);
686 const __m128i thirtytwo = _mm_set1_epi16(32);
687 sum_left = _mm_add_epi16(sum_left, thirtytwo);
688 sum_left = _mm_srai_epi16(sum_left, 6);
689 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
690 sum_left = _mm_shufflelo_epi16(sum_left, 0);
691 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
692 dc_store_64xh(&row, 64, dst, stride);
693 }
694
aom_dc_left_predictor_64x32_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)695 void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
696 const uint8_t *above,
697 const uint8_t *left) {
698 (void)above;
699 __m128i sum_left = dc_sum_32_sse2(left);
700 const __m128i sixteen = _mm_set1_epi16(16);
701 sum_left = _mm_add_epi16(sum_left, sixteen);
702 sum_left = _mm_srai_epi16(sum_left, 5);
703 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
704 sum_left = _mm_shufflelo_epi16(sum_left, 0);
705 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
706 dc_store_64xh(&row, 32, dst, stride);
707 }
708
aom_dc_left_predictor_64x16_sse2(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left)709 void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
710 const uint8_t *above,
711 const uint8_t *left) {
712 (void)above;
713 __m128i sum_left = dc_sum_16_sse2(left);
714 const __m128i eight = _mm_set1_epi16(8);
715 sum_left = _mm_add_epi16(sum_left, eight);
716 sum_left = _mm_srai_epi16(sum_left, 4);
717 sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
718 sum_left = _mm_shufflelo_epi16(sum_left, 0);
719 const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
720 dc_store_64xh(&row, 16, dst, stride);
721 }
722
723 // -----------------------------------------------------------------------------
724 // DC_128
725
// DC_128 predictor, 4 wide x 8 rows: writes the four bytes 0x80 to each of the
// 8 rows via dc_store_4xh. Neither `above` nor `left` is read.
void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint32_t fill4 = 0x80808080;  // four bytes, each 128
  (void)left;
  (void)above;
  dc_store_4xh(fill4, 8, dst, stride);
}
733
// DC_128 predictor, 4 wide x 16 rows: writes the four bytes 0x80 to each of
// the 16 rows via dc_store_4xh. Neither `above` nor `left` is read.
void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint32_t fill4 = 0x80808080;  // four bytes, each 128
  (void)left;
  (void)above;
  dc_store_4xh(fill4, 16, dst, stride);
}
741
// DC_128 predictor, 8 wide x 4 rows: stores eight 0x80 bytes per row via
// dc_store_8xh. Neither `above` nor `left` is read.
void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_8xh(&fill, 4, dst, stride);
}
749
// DC_128 predictor, 8 wide x 16 rows: stores eight 0x80 bytes per row via
// dc_store_8xh. Neither `above` nor `left` is read.
void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_8xh(&fill, 16, dst, stride);
}
757
// DC_128 predictor, 8 wide x 32 rows: stores eight 0x80 bytes per row via
// dc_store_8xh. Neither `above` nor `left` is read.
void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_8xh(&fill, 32, dst, stride);
}
765
// DC_128 predictor, 16 wide x 4 rows: stores sixteen 0x80 bytes per row via
// dc_store_16xh (aligned 16-byte stores). `above` and `left` are not read.
void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_16xh(&fill, 4, dst, stride);
}
773
// DC_128 predictor, 16 wide x 8 rows: stores sixteen 0x80 bytes per row via
// dc_store_16xh (aligned 16-byte stores). `above` and `left` are not read.
void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_16xh(&fill, 8, dst, stride);
}
781
// DC_128 predictor, 16 wide x 32 rows: stores sixteen 0x80 bytes per row via
// dc_store_16xh (aligned 16-byte stores). `above` and `left` are not read.
void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_16xh(&fill, 32, dst, stride);
}
790
// DC_128 predictor, 16 wide x 64 rows: stores sixteen 0x80 bytes per row via
// dc_store_16xh (aligned 16-byte stores). `above` and `left` are not read.
void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_16xh(&fill, 64, dst, stride);
}
799
// DC_128 predictor, 32 wide x 8 rows: stores 32 bytes of 0x80 per row via
// dc_store_32xh (two aligned 16-byte stores per row). `above` and `left` are
// not read.
void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_32xh(&fill, 8, dst, stride);
}
807
// DC_128 predictor, 32 wide x 16 rows: stores 32 bytes of 0x80 per row via
// dc_store_32xh (two aligned 16-byte stores per row). `above` and `left` are
// not read.
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_32xh(&fill, 16, dst, stride);
}
816
// DC_128 predictor, 32 wide x 64 rows: stores 32 bytes of 0x80 per row via
// dc_store_32xh (two aligned 16-byte stores per row). `above` and `left` are
// not read.
void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_32xh(&fill, 64, dst, stride);
}
825
// DC_128 predictor, 64 wide x 64 rows: passes a register of sixteen 0x80 bytes
// to dc_store_64xh with a height of 64. `above` and `left` are not read.
void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_64xh(&fill, 64, dst, stride);
}
834
// DC_128 predictor, 64 wide x 32 rows: passes a register of sixteen 0x80 bytes
// to dc_store_64xh with a height of 32. `above` and `left` are not read.
void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_64xh(&fill, 32, dst, stride);
}
843
// DC_128 predictor, 64 wide x 16 rows: passes a register of sixteen 0x80 bytes
// to dc_store_64xh with a height of 16. `above` and `left` are not read.
void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i fill = _mm_set1_epi8((int8_t)0x80);  // every byte = 128
  (void)left;
  (void)above;
  dc_store_64xh(&fill, 16, dst, stride);
}
852
853 // -----------------------------------------------------------------------------
854 // V_PRED
855
// V_PRED, 4 wide x 8 rows: copies the 4 bytes at `above` into each of the 8
// rows via dc_store_4xh. `left` is not read.
void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Assemble the 32-bit word byte by byte instead of dereferencing
  // (uint32_t *)above: that cast dropped the const qualifier and performed a
  // type-punned, possibly misaligned load. Little-endian packing yields the
  // same value the direct load produced on x86, the only target for SSE2.
  const uint32_t pred = (uint32_t)above[0] | ((uint32_t)above[1] << 8) |
                        ((uint32_t)above[2] << 16) |
                        ((uint32_t)above[3] << 24);
  dc_store_4xh(pred, 8, dst, stride);
}
862
// V_PRED, 4 wide x 16 rows: copies the 4 bytes at `above` into each of the 16
// rows via dc_store_4xh. `left` is not read.
void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  // Assemble the 32-bit word byte by byte instead of dereferencing
  // (uint32_t *)above: that cast dropped the const qualifier and performed a
  // type-punned, possibly misaligned load. Little-endian packing yields the
  // same value the direct load produced on x86, the only target for SSE2.
  const uint32_t pred = (uint32_t)above[0] | ((uint32_t)above[1] << 8) |
                        ((uint32_t)above[2] << 16) |
                        ((uint32_t)above[3] << 24);
  dc_store_4xh(pred, 16, dst, stride);
}
869
// V_PRED, 8 wide x 4 rows: loads the 8 bytes at `above` once and stores them
// to each of the 4 rows via dc_store_8xh. `left` is not read.
void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 4, dst, stride);
}
876
// V_PRED, 8 wide x 16 rows: loads the 8 bytes at `above` once and stores them
// to each of the 16 rows via dc_store_8xh. `left` is not read.
void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 16, dst, stride);
}
883
// V_PRED, 8 wide x 32 rows: loads the 8 bytes at `above` once and stores them
// to each of the 32 rows via dc_store_8xh. `left` is not read.
void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_loadl_epi64((const __m128i *)above);
  dc_store_8xh(&top, 32, dst, stride);
}
890
// V_PRED, 16 wide x 4 rows: loads the 16 bytes at `above` and stores them to
// each of the 4 rows via dc_store_16xh. The load is an aligned one, so `above`
// must be 16-byte aligned. `left` is not read.
void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 4, dst, stride);
}
897
// V_PRED, 16 wide x 8 rows: loads the 16 bytes at `above` and stores them to
// each of the 8 rows via dc_store_16xh. The load is an aligned one, so `above`
// must be 16-byte aligned. `left` is not read.
void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 8, dst, stride);
}
904
// V_PRED, 16 wide x 32 rows: loads the 16 bytes at `above` and stores them to
// each of the 32 rows via dc_store_16xh. The load is an aligned one, so
// `above` must be 16-byte aligned. `left` is not read.
void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 32, dst, stride);
}
911
// V_PRED, 16 wide x 64 rows: loads the 16 bytes at `above` and stores them to
// each of the 64 rows via dc_store_16xh. The load is an aligned one, so
// `above` must be 16-byte aligned. `left` is not read.
void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  const __m128i top = _mm_load_si128((const __m128i *)above);
  dc_store_16xh(&top, 64, dst, stride);
}
918
v_predictor_32xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int height)919 static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
920 const uint8_t *above, int height) {
921 const __m128i row0 = _mm_load_si128((__m128i const *)above);
922 const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
923 for (int i = 0; i < height; ++i) {
924 _mm_store_si128((__m128i *)dst, row0);
925 _mm_store_si128((__m128i *)(dst + 16), row1);
926 dst += stride;
927 }
928 }
929
// V_PRED, 32 wide x 8 rows: delegates to v_predictor_32xh. `left` is not read.
void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int rows = 8;
  (void)left;
  v_predictor_32xh(dst, stride, above, rows);
}
935
// V_PRED, 32 wide x 16 rows: delegates to v_predictor_32xh. `left` is not
// read.
void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  (void)left;
  v_predictor_32xh(dst, stride, above, rows);
}
941
// V_PRED, 32 wide x 64 rows: delegates to v_predictor_32xh. `left` is not
// read.
void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  (void)left;
  v_predictor_32xh(dst, stride, above, rows);
}
947
v_predictor_64xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,int height)948 static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
949 const uint8_t *above, int height) {
950 const __m128i row0 = _mm_load_si128((__m128i const *)above);
951 const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
952 const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
953 const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
954 for (int i = 0; i < height; ++i) {
955 _mm_store_si128((__m128i *)dst, row0);
956 _mm_store_si128((__m128i *)(dst + 16), row1);
957 _mm_store_si128((__m128i *)(dst + 32), row2);
958 _mm_store_si128((__m128i *)(dst + 48), row3);
959 dst += stride;
960 }
961 }
962
// V_PRED, 64 wide x 64 rows: delegates to v_predictor_64xh. `left` is not
// read.
void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  (void)left;
  v_predictor_64xh(dst, stride, above, rows);
}
968
// V_PRED, 64 wide x 32 rows: delegates to v_predictor_64xh. `left` is not
// read.
void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 32;
  (void)left;
  v_predictor_64xh(dst, stride, above, rows);
}
974
// V_PRED, 64 wide x 16 rows: delegates to v_predictor_64xh. `left` is not
// read.
void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  (void)left;
  v_predictor_64xh(dst, stride, above, rows);
}
980
981 // -----------------------------------------------------------------------------
982 // H_PRED
983
// H_PRED, 4 wide x 8 rows: row r of the output is left[r] repeated 4 times.
// Reads 8 bytes from `left`; `above` is not read. Each row is written with a
// single 32-bit store through an int pointer.
void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)left);
  // Double every byte so that 16-bit lane r holds left[r] twice.
  const __m128i doubled = _mm_unpacklo_epi8(pixels, pixels);
  // group[0] carries rows 0-3 in its low four lanes, group[1] rows 4-7.
  const __m128i group[2] = { doubled, _mm_unpackhi_epi64(doubled, doubled) };
  for (int g = 0; g < 2; ++g) {
    const __m128i src = group[g];
    uint8_t *const out = dst + (ptrdiff_t)(4 * g) * stride;
    // Broadcast one lane across the low 64 bits, then keep the low 4 bytes.
    const __m128i r0 = _mm_shufflelo_epi16(src, 0);
    const __m128i r1 = _mm_shufflelo_epi16(src, 0x55);
    const __m128i r2 = _mm_shufflelo_epi16(src, 0xaa);
    const __m128i r3 = _mm_shufflelo_epi16(src, 0xff);
    *(int *)out = _mm_cvtsi128_si32(r0);
    *(int *)(out + stride) = _mm_cvtsi128_si32(r1);
    *(int *)(out + 2 * stride) = _mm_cvtsi128_si32(r2);
    *(int *)(out + 3 * stride) = _mm_cvtsi128_si32(r3);
  }
}
1014
// H_PRED, 4 wide x 16 rows: row r of the output is left[r] repeated 4 times.
// Reads 16 bytes from `left` with an aligned load, so `left` must be 16-byte
// aligned; `above` is not read. Each row is written with a single 32-bit store
// through an int pointer.
void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_load_si128((const __m128i *)left);
  // Double every byte: lanes of `lo` hold rows 0-7, lanes of `hi` rows 8-15.
  const __m128i lo = _mm_unpacklo_epi8(pixels, pixels);
  const __m128i hi = _mm_unpackhi_epi8(pixels, pixels);
  // group[g] carries rows 4g .. 4g+3 in its low four 16-bit lanes.
  const __m128i group[4] = { lo, _mm_unpackhi_epi64(lo, lo), hi,
                             _mm_unpackhi_epi64(hi, hi) };
  for (int g = 0; g < 4; ++g) {
    const __m128i src = group[g];
    uint8_t *const out = dst + (ptrdiff_t)(4 * g) * stride;
    // Broadcast one lane across the low 64 bits, then keep the low 4 bytes.
    const __m128i r0 = _mm_shufflelo_epi16(src, 0);
    const __m128i r1 = _mm_shufflelo_epi16(src, 0x55);
    const __m128i r2 = _mm_shufflelo_epi16(src, 0xaa);
    const __m128i r3 = _mm_shufflelo_epi16(src, 0xff);
    *(int *)out = _mm_cvtsi128_si32(r0);
    *(int *)(out + stride) = _mm_cvtsi128_si32(r1);
    *(int *)(out + 2 * stride) = _mm_cvtsi128_si32(r2);
    *(int *)(out + 3 * stride) = _mm_cvtsi128_si32(r3);
  }
}
1075
// H_PRED, 8 wide x 4 rows: row r of the output is left[r] repeated 8 times.
// Note that the load fetches 8 bytes from `left` although only the first 4 are
// used. `above` is not read.
void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)left);
  // Double every byte so that 16-bit lane r holds left[r] twice.
  const __m128i doubled = _mm_unpacklo_epi8(pixels, pixels);
  // Broadcasting lane r over the low 64 bits yields eight copies of left[r].
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(doubled, 0));
  _mm_storel_epi64((__m128i *)(dst + stride),
                   _mm_shufflelo_epi16(doubled, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride),
                   _mm_shufflelo_epi16(doubled, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride),
                   _mm_shufflelo_epi16(doubled, 0xff));
}
1093
h_predictor_8x16xc(uint8_t * dst,ptrdiff_t stride,const uint8_t * above,const uint8_t * left,int count)1094 static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
1095 const uint8_t *above, const uint8_t *left,
1096 int count) {
1097 (void)above;
1098 for (int i = 0; i < count; ++i) {
1099 const __m128i left_col = _mm_load_si128((__m128i const *)left);
1100 __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
1101 __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
1102
1103 __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
1104 __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1105 __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1106 __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1107 _mm_storel_epi64((__m128i *)dst, row0);
1108 dst += stride;
1109 _mm_storel_epi64((__m128i *)dst, row1);
1110 dst += stride;
1111 _mm_storel_epi64((__m128i *)dst, row2);
1112 dst += stride;
1113 _mm_storel_epi64((__m128i *)dst, row3);
1114 dst += stride;
1115
1116 left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
1117 row0 = _mm_shufflelo_epi16(left_col_low, 0);
1118 row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
1119 row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
1120 row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
1121 _mm_storel_epi64((__m128i *)dst, row0);
1122 dst += stride;
1123 _mm_storel_epi64((__m128i *)dst, row1);
1124 dst += stride;
1125 _mm_storel_epi64((__m128i *)dst, row2);
1126 dst += stride;
1127 _mm_storel_epi64((__m128i *)dst, row3);
1128 dst += stride;
1129
1130 row0 = _mm_shufflelo_epi16(left_col_high, 0);
1131 row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1132 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1133 row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1134 _mm_storel_epi64((__m128i *)dst, row0);
1135 dst += stride;
1136 _mm_storel_epi64((__m128i *)dst, row1);
1137 dst += stride;
1138 _mm_storel_epi64((__m128i *)dst, row2);
1139 dst += stride;
1140 _mm_storel_epi64((__m128i *)dst, row3);
1141 dst += stride;
1142
1143 left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
1144 row0 = _mm_shufflelo_epi16(left_col_high, 0);
1145 row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
1146 row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
1147 row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
1148 _mm_storel_epi64((__m128i *)dst, row0);
1149 dst += stride;
1150 _mm_storel_epi64((__m128i *)dst, row1);
1151 dst += stride;
1152 _mm_storel_epi64((__m128i *)dst, row2);
1153 dst += stride;
1154 _mm_storel_epi64((__m128i *)dst, row3);
1155 dst += stride;
1156 left += 16;
1157 }
1158 }
1159
// H_PRED, 8 wide x 16 rows: one 16-row chunk of h_predictor_8x16xc.
void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int chunks_of_16_rows = 1;
  h_predictor_8x16xc(dst, stride, above, left, chunks_of_16_rows);
}
1164
// H_PRED, 8 wide x 32 rows: two 16-row chunks of h_predictor_8x16xc.
void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const int chunks_of_16_rows = 2;
  h_predictor_8x16xc(dst, stride, above, left, chunks_of_16_rows);
}
1169
h_pred_store_16xh(const __m128i * row,int h,uint8_t * dst,ptrdiff_t stride)1170 static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
1171 ptrdiff_t stride) {
1172 int i;
1173 for (i = 0; i < h; ++i) {
1174 _mm_store_si128((__m128i *)dst, row[i]);
1175 dst += stride;
1176 }
1177 }
1178
repeat_low_4pixels(const __m128i * x,__m128i * row)1179 static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
1180 const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
1181 const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
1182 const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
1183 const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
1184
1185 row[0] = _mm_unpacklo_epi64(u0, u0);
1186 row[1] = _mm_unpacklo_epi64(u1, u1);
1187 row[2] = _mm_unpacklo_epi64(u2, u2);
1188 row[3] = _mm_unpacklo_epi64(u3, u3);
1189 }
1190
repeat_high_4pixels(const __m128i * x,__m128i * row)1191 static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
1192 const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
1193 const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
1194 const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
1195 const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
1196
1197 row[0] = _mm_unpackhi_epi64(u0, u0);
1198 row[1] = _mm_unpackhi_epi64(u1, u1);
1199 row[2] = _mm_unpackhi_epi64(u2, u2);
1200 row[3] = _mm_unpackhi_epi64(u3, u3);
1201 }
1202
1203 // Process 16x8, first 4 rows
1204 // Use first 8 bytes of left register: xxxxxxxx33221100
h_prediction_16x8_1(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1205 static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
1206 ptrdiff_t stride) {
1207 __m128i row[4];
1208 repeat_low_4pixels(left, row);
1209 h_pred_store_16xh(row, 4, dst, stride);
1210 }
1211
1212 // Process 16x8, second 4 rows
1213 // Use second 8 bytes of left register: 77665544xxxxxxxx
h_prediction_16x8_2(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1214 static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
1215 ptrdiff_t stride) {
1216 __m128i row[4];
1217 repeat_high_4pixels(left, row);
1218 h_pred_store_16xh(row, 4, dst, stride);
1219 }
1220
// H_PRED, 16 wide x 4 rows. Loads 8 bytes from `left` (only the first 4 end
// up being used), doubles each byte into a 16-bit lane and lets
// h_prediction_16x8_1 emit rows 0-3. `above` is not read.
void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)left);
  const __m128i doubled = _mm_unpacklo_epi8(pixels, pixels);
  h_prediction_16x8_1(&doubled, dst, stride);
}
1228
// H_PRED, 16 wide x 8 rows. Loads 8 bytes from `left`, doubles each byte into
// a 16-bit lane, then emits rows 0-3 from the low lanes and rows 4-7 from the
// high lanes. `above` is not read.
void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_loadl_epi64((const __m128i *)left);
  const __m128i doubled = _mm_unpacklo_epi8(pixels, pixels);
  uint8_t *const lower_half = dst + 4 * stride;
  h_prediction_16x8_1(&doubled, dst, stride);
  h_prediction_16x8_2(&doubled, lower_half, stride);
}
1238
h_predictor_16xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int count)1239 static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
1240 const uint8_t *left, int count) {
1241 int i = 0;
1242 do {
1243 const __m128i left_col = _mm_load_si128((const __m128i *)left);
1244 const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
1245 h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
1246 dst += stride << 2;
1247 h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
1248 dst += stride << 2;
1249
1250 const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
1251 h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
1252 dst += stride << 2;
1253 h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
1254 dst += stride << 2;
1255
1256 left += 16;
1257 i++;
1258 } while (i < count);
1259 }
1260
// H_PRED, 16 wide x 32 rows: two 16-row passes of h_predictor_16xh. `above`
// is not read.
void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int passes_of_16_rows = 2;
  (void)above;
  h_predictor_16xh(dst, stride, left, passes_of_16_rows);
}
1266
// H_PRED, 16 wide x 64 rows: four 16-row passes of h_predictor_16xh. `above`
// is not read.
void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int passes_of_16_rows = 4;
  (void)above;
  h_predictor_16xh(dst, stride, left, passes_of_16_rows);
}
1272
h_pred_store_32xh(const __m128i * row,int h,uint8_t * dst,ptrdiff_t stride)1273 static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
1274 ptrdiff_t stride) {
1275 int i;
1276 for (i = 0; i < h; ++i) {
1277 _mm_store_si128((__m128i *)dst, row[i]);
1278 _mm_store_si128((__m128i *)(dst + 16), row[i]);
1279 dst += stride;
1280 }
1281 }
1282
1283 // Process 32x8, first 4 rows
1284 // Use first 8 bytes of left register: xxxxxxxx33221100
h_prediction_32x8_1(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1285 static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
1286 ptrdiff_t stride) {
1287 __m128i row[4];
1288 repeat_low_4pixels(left, row);
1289 h_pred_store_32xh(row, 4, dst, stride);
1290 }
1291
1292 // Process 32x8, second 4 rows
1293 // Use second 8 bytes of left register: 77665544xxxxxxxx
h_prediction_32x8_2(const __m128i * left,uint8_t * dst,ptrdiff_t stride)1294 static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
1295 ptrdiff_t stride) {
1296 __m128i row[4];
1297 repeat_high_4pixels(left, row);
1298 h_pred_store_32xh(row, 4, dst, stride);
1299 }
1300
// H_PRED, 32 wide x 8 rows. Performs an aligned 16-byte load from `left` (so
// `left` must be 16-byte aligned) although only the low 8 bytes are used,
// doubles those bytes into 16-bit lanes, then emits rows 0-3 and rows 4-7.
// `above` is not read.
void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_load_si128((const __m128i *)left);
  const __m128i doubled = _mm_unpacklo_epi8(pixels, pixels);
  uint8_t *const lower_half = dst + 4 * stride;
  h_prediction_32x8_1(&doubled, dst, stride);
  h_prediction_32x8_2(&doubled, lower_half, stride);
}
1313
// H_PRED, 32 wide x 16 rows. Performs an aligned 16-byte load from `left` (so
// `left` must be 16-byte aligned), doubles each byte into a 16-bit lane and
// emits the 16 rows four at a time. `above` is not read.
void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m128i pixels = _mm_load_si128((const __m128i *)left);
  // half[0] covers rows 0-7, half[1] covers rows 8-15.
  const __m128i half[2] = { _mm_unpacklo_epi8(pixels, pixels),
                            _mm_unpackhi_epi8(pixels, pixels) };
  for (int k = 0; k < 2; ++k) {
    uint8_t *const out = dst + (ptrdiff_t)(8 * k) * stride;
    h_prediction_32x8_1(&half[k], out, stride);
    h_prediction_32x8_2(&half[k], out + 4 * stride, stride);
  }
}
1332
h_predictor_32xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int height)1333 static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
1334 const uint8_t *left, int height) {
1335 int i = height >> 2;
1336 do {
1337 __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
1338 left4 = _mm_unpacklo_epi8(left4, left4);
1339 left4 = _mm_unpacklo_epi8(left4, left4);
1340 const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
1341 const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
1342 _mm_store_si128((__m128i *)dst, r0);
1343 _mm_store_si128((__m128i *)(dst + 16), r0);
1344 _mm_store_si128((__m128i *)(dst + stride), r1);
1345 _mm_store_si128((__m128i *)(dst + stride + 16), r1);
1346 const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
1347 const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
1348 _mm_store_si128((__m128i *)(dst + stride * 2), r2);
1349 _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
1350 _mm_store_si128((__m128i *)(dst + stride * 3), r3);
1351 _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
1352 left += 4;
1353 dst += stride * 4;
1354 } while (--i);
1355 }
1356
// H_PRED, 32 wide x 64 rows: delegates to h_predictor_32xh. `above` is not
// read.
void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  (void)above;
  h_predictor_32xh(dst, stride, left, rows);
}
1362
h_predictor_64xh(uint8_t * dst,ptrdiff_t stride,const uint8_t * left,int height)1363 static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
1364 const uint8_t *left, int height) {
1365 int i = height >> 2;
1366 do {
1367 __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
1368 left4 = _mm_unpacklo_epi8(left4, left4);
1369 left4 = _mm_unpacklo_epi8(left4, left4);
1370 const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
1371 const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
1372 _mm_store_si128((__m128i *)dst, r0);
1373 _mm_store_si128((__m128i *)(dst + 16), r0);
1374 _mm_store_si128((__m128i *)(dst + 32), r0);
1375 _mm_store_si128((__m128i *)(dst + 48), r0);
1376 _mm_store_si128((__m128i *)(dst + stride), r1);
1377 _mm_store_si128((__m128i *)(dst + stride + 16), r1);
1378 _mm_store_si128((__m128i *)(dst + stride + 32), r1);
1379 _mm_store_si128((__m128i *)(dst + stride + 48), r1);
1380 const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
1381 const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
1382 _mm_store_si128((__m128i *)(dst + stride * 2), r2);
1383 _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
1384 _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
1385 _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
1386 _mm_store_si128((__m128i *)(dst + stride * 3), r3);
1387 _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
1388 _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
1389 _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
1390 left += 4;
1391 dst += stride * 4;
1392 } while (--i);
1393 }
1394
// H_PRED, 64 wide x 64 rows: delegates to h_predictor_64xh. `above` is not
// read.
void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 64;
  (void)above;
  h_predictor_64xh(dst, stride, left, rows);
}
1400
// H_PRED, 64 wide x 32 rows: delegates to h_predictor_64xh. `above` is not
// read.
void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 32;
  (void)above;
  h_predictor_64xh(dst, stride, left, rows);
}
1406
// H_PRED, 64 wide x 16 rows: delegates to h_predictor_64xh. `above` is not
// read.
void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int rows = 16;
  (void)above;
  h_predictor_64xh(dst, stride, left, rows);
}
1412