1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h>
13
14 #include "config/aom_dsp_rtcd.h"
15
16 // -----------------------------------------------------------------------------
17 // H_PRED
18
void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // H prediction: each output row is the matching left-column pixel
  // broadcast across the 4-wide row. The above row and bit depth are unused.
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  (void)above;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, _mm_shufflelo_epi16(l, 0x0));
  _mm_storel_epi64((__m128i *)(dst + stride), _mm_shufflelo_epi16(l, 0x55));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride),
                   _mm_shufflelo_epi16(l, 0xaa));
  _mm_storel_epi64((__m128i *)(dst + 3 * stride),
                   _mm_shufflelo_epi16(l, 0xff));
}
37
void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Handle the 4x8 block as two stacked 4x4 blocks, each fed the
  // corresponding half of the left column.
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_4x4_sse2(dst + 4 * stride, stride, above, left + 4,
                                  bd);
}
46
void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Broadcast left[0..3] across four 8-wide rows. Above/bd are unused.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i r0 = _mm_shufflelo_epi16(l, 0x0);
  const __m128i r1 = _mm_shufflelo_epi16(l, 0x55);
  const __m128i r2 = _mm_shufflelo_epi16(l, 0xaa);
  const __m128i r3 = _mm_shufflelo_epi16(l, 0xff);
  (void)above;
  (void)bd;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(r0, r0));
  _mm_store_si128((__m128i *)(dst + stride), _mm_unpacklo_epi64(r1, r1));
  _mm_store_si128((__m128i *)(dst + 2 * stride), _mm_unpacklo_epi64(r2, r2));
  _mm_store_si128((__m128i *)(dst + 3 * stride), _mm_unpacklo_epi64(r3, r3));
}
65
void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Rows 0-3 broadcast left[0..3] (low half of the register, shufflelo +
  // unpacklo); rows 4-7 broadcast left[4..7] (high half, shufflehi +
  // unpackhi). Above/bd are unused.
  const __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i r0 = _mm_shufflelo_epi16(l, 0x0);
  const __m128i r1 = _mm_shufflelo_epi16(l, 0x55);
  const __m128i r2 = _mm_shufflelo_epi16(l, 0xaa);
  const __m128i r3 = _mm_shufflelo_epi16(l, 0xff);
  const __m128i r4 = _mm_shufflehi_epi16(l, 0x0);
  const __m128i r5 = _mm_shufflehi_epi16(l, 0x55);
  const __m128i r6 = _mm_shufflehi_epi16(l, 0xaa);
  const __m128i r7 = _mm_shufflehi_epi16(l, 0xff);
  (void)above;
  (void)bd;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(r0, r0));
  _mm_store_si128((__m128i *)(dst + stride), _mm_unpacklo_epi64(r1, r1));
  _mm_store_si128((__m128i *)(dst + 2 * stride), _mm_unpacklo_epi64(r2, r2));
  _mm_store_si128((__m128i *)(dst + 3 * stride), _mm_unpacklo_epi64(r3, r3));
  _mm_store_si128((__m128i *)(dst + 4 * stride), _mm_unpackhi_epi64(r4, r4));
  _mm_store_si128((__m128i *)(dst + 5 * stride), _mm_unpackhi_epi64(r5, r5));
  _mm_store_si128((__m128i *)(dst + 6 * stride), _mm_unpackhi_epi64(r6, r6));
  _mm_store_si128((__m128i *)(dst + 7 * stride), _mm_unpackhi_epi64(r7, r7));
}
96
void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // Two stacked 8x8 blocks, each fed its half of the left column.
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_8x8_sse2(dst + 8 * stride, stride, above, left + 8,
                                  bd);
}
105
h_store_16_unpacklo(uint16_t ** dst,const ptrdiff_t stride,const __m128i * row)106 static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
107 const __m128i *row) {
108 const __m128i val = _mm_unpacklo_epi64(*row, *row);
109 _mm_store_si128((__m128i *)*dst, val);
110 _mm_store_si128((__m128i *)(*dst + 8), val);
111 *dst += stride;
112 }
113
h_store_16_unpackhi(uint16_t ** dst,const ptrdiff_t stride,const __m128i * row)114 static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
115 const __m128i *row) {
116 const __m128i val = _mm_unpackhi_epi64(*row, *row);
117 _mm_store_si128((__m128i *)(*dst), val);
118 _mm_store_si128((__m128i *)(*dst + 8), val);
119 *dst += stride;
120 }
121
h_predictor_16x8(uint16_t * dst,ptrdiff_t stride,const uint16_t * left)122 static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
123 const uint16_t *left) {
124 const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
125 const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
126 const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
127 const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
128 const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
129 const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
130 const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
131 const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
132 const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
133 h_store_16_unpacklo(&dst, stride, &row0);
134 h_store_16_unpacklo(&dst, stride, &row1);
135 h_store_16_unpacklo(&dst, stride, &row2);
136 h_store_16_unpacklo(&dst, stride, &row3);
137 h_store_16_unpackhi(&dst, stride, &row4);
138 h_store_16_unpackhi(&dst, stride, &row5);
139 h_store_16_unpackhi(&dst, stride, &row6);
140 h_store_16_unpackhi(&dst, stride, &row7);
141 }
142
void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // H prediction ignores the above row and the bit depth.
  (void)bd;
  (void)above;
  h_predictor_16x8(dst, stride, left);
}
150
void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Two stacked 16x8 slabs, each consuming 8 left pixels.
  (void)above;
  (void)bd;
  h_predictor_16x8(dst, stride, left);
  h_predictor_16x8(dst + 8 * stride, stride, left + 8);
}
163
void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Four stacked 16x8 slabs, each consuming 8 left pixels.
  (void)above;
  (void)bd;
  for (int i = 0; i < 4; ++i) {
    h_predictor_16x8(dst, stride, left);
    dst += 8 * stride;
    left += 8;
  }
}
176
h_store_32_unpacklo(uint16_t ** dst,const ptrdiff_t stride,const __m128i * row)177 static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
178 const __m128i *row) {
179 const __m128i val = _mm_unpacklo_epi64(*row, *row);
180 _mm_store_si128((__m128i *)(*dst), val);
181 _mm_store_si128((__m128i *)(*dst + 8), val);
182 _mm_store_si128((__m128i *)(*dst + 16), val);
183 _mm_store_si128((__m128i *)(*dst + 24), val);
184 *dst += stride;
185 }
186
h_store_32_unpackhi(uint16_t ** dst,const ptrdiff_t stride,const __m128i * row)187 static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
188 const __m128i *row) {
189 const __m128i val = _mm_unpackhi_epi64(*row, *row);
190 _mm_store_si128((__m128i *)(*dst), val);
191 _mm_store_si128((__m128i *)(*dst + 8), val);
192 _mm_store_si128((__m128i *)(*dst + 16), val);
193 _mm_store_si128((__m128i *)(*dst + 24), val);
194 *dst += stride;
195 }
196
h_predictor_32x8(uint16_t * dst,ptrdiff_t stride,const uint16_t * left)197 static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
198 const uint16_t *left) {
199 const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
200 const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
201 const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
202 const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
203 const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
204 const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
205 const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
206 const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
207 const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
208 h_store_32_unpacklo(&dst, stride, &row0);
209 h_store_32_unpacklo(&dst, stride, &row1);
210 h_store_32_unpacklo(&dst, stride, &row2);
211 h_store_32_unpacklo(&dst, stride, &row3);
212 h_store_32_unpackhi(&dst, stride, &row4);
213 h_store_32_unpackhi(&dst, stride, &row5);
214 h_store_32_unpackhi(&dst, stride, &row6);
215 h_store_32_unpackhi(&dst, stride, &row7);
216 }
217
void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Two stacked 32x8 slabs, each consuming 8 left pixels.
  (void)above;
  (void)bd;
  h_predictor_32x8(dst, stride, left);
  h_predictor_32x8(dst + 8 * stride, stride, left + 8);
}
230
void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  // Four stacked 32x8 slabs, each consuming 8 left pixels.
  (void)above;
  (void)bd;
  for (int i = 0; i < 4; ++i) {
    h_predictor_32x8(dst, stride, left);
    dst += 8 * stride;
    left += 8;
  }
}
243
244 // -----------------------------------------------------------------------------
245 // DC_TOP, DC_LEFT, DC_128
246
247 // 4x4
248
dc_sum_4(const uint16_t * ref)249 static INLINE __m128i dc_sum_4(const uint16_t *ref) {
250 const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
251 const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
252 const __m128i a = _mm_add_epi16(_dcba, _xxdc);
253 return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
254 }
255
dc_store_4x4(uint16_t * dst,ptrdiff_t stride,const __m128i * dc)256 static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
257 const __m128i *dc) {
258 const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
259 int i;
260 for (i = 0; i < 4; ++i, dst += stride) {
261 _mm_storel_epi64((__m128i *)dst, dc_dup);
262 }
263 }
264
aom_highbd_dc_left_predictor_4x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)265 void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
266 const uint16_t *above,
267 const uint16_t *left, int bd) {
268 const __m128i two = _mm_cvtsi32_si128(2);
269 const __m128i sum = dc_sum_4(left);
270 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
271 (void)above;
272 (void)bd;
273 dc_store_4x4(dst, stride, &dc);
274 }
275
aom_highbd_dc_top_predictor_4x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)276 void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
277 const uint16_t *above,
278 const uint16_t *left, int bd) {
279 const __m128i two = _mm_cvtsi32_si128(2);
280 const __m128i sum = dc_sum_4(above);
281 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
282 (void)left;
283 (void)bd;
284 dc_store_4x4(dst, stride, &dc);
285 }
286
void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_4x4(dst, stride, &dc);
}
296
297 // -----------------------------------------------------------------------------
298 // 4x8
299
dc_store_4x8(uint16_t * dst,ptrdiff_t stride,const __m128i * dc)300 static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
301 const __m128i *dc) {
302 const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
303 int i;
304 for (i = 0; i < 8; ++i, dst += stride) {
305 _mm_storel_epi64((__m128i *)dst, dc_dup);
306 }
307 }
308
309 // Shared with DC 8xh
dc_sum_8(const uint16_t * ref)310 static INLINE __m128i dc_sum_8(const uint16_t *ref) {
311 const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
312 const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
313 const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
314 const __m128i a = _mm_add_epi16(_dcba, _xxdc);
315
316 return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
317 }
318
aom_highbd_dc_left_predictor_4x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)319 void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
320 const uint16_t *above,
321 const uint16_t *left, int bd) {
322 const __m128i sum = dc_sum_8(left);
323 const __m128i four = _mm_cvtsi32_si128(4);
324 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
325 (void)above;
326 (void)bd;
327 dc_store_4x8(dst, stride, &dc);
328 }
329
aom_highbd_dc_top_predictor_4x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)330 void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
331 const uint16_t *above,
332 const uint16_t *left, int bd) {
333 const __m128i two = _mm_cvtsi32_si128(2);
334 const __m128i sum = dc_sum_4(above);
335 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
336 (void)left;
337 (void)bd;
338 dc_store_4x8(dst, stride, &dc);
339 }
340
void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_4x8(dst, stride, &dc);
}
350
351 // -----------------------------------------------------------------------------
352 // 8xh
353
dc_store_8xh(uint16_t * dst,ptrdiff_t stride,int height,const __m128i * dc)354 static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
355 const __m128i *dc) {
356 const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
357 const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
358 int i;
359 for (i = 0; i < height; ++i, dst += stride) {
360 _mm_store_si128((__m128i *)dst, dc_dup);
361 }
362 }
363
364 // -----------------------------------------------------------------------------
365 // DC_TOP
366
dc_top_predictor_8xh(uint16_t * dst,ptrdiff_t stride,int height,const uint16_t * above)367 static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
368 int height, const uint16_t *above) {
369 const __m128i four = _mm_cvtsi32_si128(4);
370 const __m128i sum = dc_sum_8(above);
371 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
372 dc_store_8xh(dst, stride, height, &dc);
373 }
374
void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // Thin wrapper: left column and bit depth are unused for DC_TOP.
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 4, above);
}
382
void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // Thin wrapper: left column and bit depth are unused for DC_TOP.
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 8, above);
}
390
void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // Thin wrapper: left column and bit depth are unused for DC_TOP.
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, 16, above);
}
398
399 // -----------------------------------------------------------------------------
400 // DC_LEFT
401
aom_highbd_dc_left_predictor_8x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)402 void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
403 const uint16_t *above,
404 const uint16_t *left, int bd) {
405 const __m128i two = _mm_cvtsi32_si128(2);
406 const __m128i sum = dc_sum_4(left);
407 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
408 (void)above;
409 (void)bd;
410 dc_store_8xh(dst, stride, 4, &dc);
411 }
412
aom_highbd_dc_left_predictor_8x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)413 void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
414 const uint16_t *above,
415 const uint16_t *left, int bd) {
416 const __m128i four = _mm_cvtsi32_si128(4);
417 const __m128i sum = dc_sum_8(left);
418 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
419 (void)above;
420 (void)bd;
421 dc_store_8xh(dst, stride, 8, &dc);
422 }
423
424 // Shared with DC 16xh
dc_sum_16(const uint16_t * ref)425 static INLINE __m128i dc_sum_16(const uint16_t *ref) {
426 const __m128i sum_lo = dc_sum_8(ref);
427 const __m128i sum_hi = dc_sum_8(ref + 8);
428 return _mm_add_epi16(sum_lo, sum_hi);
429 }
430
aom_highbd_dc_left_predictor_8x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)431 void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
432 const uint16_t *above,
433 const uint16_t *left, int bd) {
434 const __m128i eight = _mm_cvtsi32_si128(8);
435 const __m128i sum = dc_sum_16(left);
436 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
437 (void)above;
438 (void)bd;
439 dc_store_8xh(dst, stride, 16, &dc);
440 }
441
442 // -----------------------------------------------------------------------------
443 // DC_128
444
dc_128_predictor_8xh(uint16_t * dst,ptrdiff_t stride,int height,int bd)445 static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
446 int height, int bd) {
447 const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
448 const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
449 dc_store_8xh(dst, stride, height, &dc_dup);
450 }
451
void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // Thin wrapper: neighbors are unused for DC_128.
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 4, bd);
}
459
void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  // Thin wrapper: neighbors are unused for DC_128.
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 8, bd);
}
467
void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // Thin wrapper: neighbors are unused for DC_128.
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, 16, bd);
}
475
476 // -----------------------------------------------------------------------------
477 // 16xh
478
dc_store_16xh(uint16_t * dst,ptrdiff_t stride,int height,const __m128i * dc)479 static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
480 const __m128i *dc) {
481 const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
482 const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
483 int i;
484 for (i = 0; i < height; ++i, dst += stride) {
485 _mm_store_si128((__m128i *)dst, dc_dup);
486 _mm_store_si128((__m128i *)(dst + 8), dc_dup);
487 }
488 }
489
490 // -----------------------------------------------------------------------------
491 // DC_LEFT
492
aom_highbd_dc_left_predictor_16x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)493 void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
494 const uint16_t *above,
495 const uint16_t *left, int bd) {
496 const __m128i four = _mm_cvtsi32_si128(4);
497 const __m128i sum = dc_sum_8(left);
498 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
499 (void)above;
500 (void)bd;
501 dc_store_16xh(dst, stride, 8, &dc);
502 }
503
aom_highbd_dc_left_predictor_16x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)504 void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
505 const uint16_t *above,
506 const uint16_t *left, int bd) {
507 const __m128i eight = _mm_cvtsi32_si128(8);
508 const __m128i sum = dc_sum_16(left);
509 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
510 (void)above;
511 (void)bd;
512 dc_store_16xh(dst, stride, 16, &dc);
513 }
514
515 // Shared with 32xh
dc_sum_32(const uint16_t * ref)516 static INLINE __m128i dc_sum_32(const uint16_t *ref) {
517 const __m128i zero = _mm_setzero_si128();
518 const __m128i sum_a = dc_sum_16(ref);
519 const __m128i sum_b = dc_sum_16(ref + 16);
520 // 12 bit bd will outrange, so expand to 32 bit before adding final total
521 return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
522 _mm_unpacklo_epi16(sum_b, zero));
523 }
524
aom_highbd_dc_left_predictor_16x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)525 void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
526 const uint16_t *above,
527 const uint16_t *left, int bd) {
528 const __m128i sixteen = _mm_cvtsi32_si128(16);
529 const __m128i sum = dc_sum_32(left);
530 const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
531 (void)above;
532 (void)bd;
533 dc_store_16xh(dst, stride, 32, &dc);
534 }
535
536 // -----------------------------------------------------------------------------
537 // DC_TOP
538
aom_highbd_dc_top_predictor_16x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)539 void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
540 const uint16_t *above,
541 const uint16_t *left, int bd) {
542 const __m128i eight = _mm_cvtsi32_si128(8);
543 const __m128i sum = dc_sum_16(above);
544 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
545 (void)left;
546 (void)bd;
547 dc_store_16xh(dst, stride, 8, &dc);
548 }
549
aom_highbd_dc_top_predictor_16x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)550 void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
551 const uint16_t *above,
552 const uint16_t *left, int bd) {
553 const __m128i eight = _mm_cvtsi32_si128(8);
554 const __m128i sum = dc_sum_16(above);
555 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
556 (void)left;
557 (void)bd;
558 dc_store_16xh(dst, stride, 16, &dc);
559 }
560
aom_highbd_dc_top_predictor_16x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)561 void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
562 const uint16_t *above,
563 const uint16_t *left, int bd) {
564 const __m128i eight = _mm_cvtsi32_si128(8);
565 const __m128i sum = dc_sum_16(above);
566 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
567 (void)left;
568 (void)bd;
569 dc_store_16xh(dst, stride, 32, &dc);
570 }
571
572 // -----------------------------------------------------------------------------
573 // DC_128
574
void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_16xh(dst, stride, 8, &dc);
}
584
void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_16xh(dst, stride, 16, &dc);
}
594
void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_16xh(dst, stride, 32, &dc);
}
604
605 // -----------------------------------------------------------------------------
606 // 32xh
607
dc_store_32xh(uint16_t * dst,ptrdiff_t stride,int height,const __m128i * dc)608 static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
609 const __m128i *dc) {
610 const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
611 const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
612 int i;
613 for (i = 0; i < height; ++i, dst += stride) {
614 _mm_store_si128((__m128i *)dst, dc_dup);
615 _mm_store_si128((__m128i *)(dst + 8), dc_dup);
616 _mm_store_si128((__m128i *)(dst + 16), dc_dup);
617 _mm_store_si128((__m128i *)(dst + 24), dc_dup);
618 }
619 }
620
aom_highbd_dc_left_predictor_32x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)621 void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
622 const uint16_t *above,
623 const uint16_t *left, int bd) {
624 const __m128i eight = _mm_cvtsi32_si128(8);
625 const __m128i sum = dc_sum_16(left);
626 const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
627 (void)above;
628 (void)bd;
629 dc_store_32xh(dst, stride, 16, &dc);
630 }
631
aom_highbd_dc_left_predictor_32x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)632 void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
633 const uint16_t *above,
634 const uint16_t *left, int bd) {
635 const __m128i sixteen = _mm_cvtsi32_si128(16);
636 const __m128i sum = dc_sum_32(left);
637 const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
638 (void)above;
639 (void)bd;
640 dc_store_32xh(dst, stride, 32, &dc);
641 }
642
aom_highbd_dc_top_predictor_32x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)643 void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
644 const uint16_t *above,
645 const uint16_t *left, int bd) {
646 const __m128i sixteen = _mm_cvtsi32_si128(16);
647 const __m128i sum = dc_sum_32(above);
648 const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
649 (void)left;
650 (void)bd;
651 dc_store_32xh(dst, stride, 16, &dc);
652 }
653
void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_32xh(dst, stride, 16, &dc);
}
663
aom_highbd_dc_top_predictor_32x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)664 void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
665 const uint16_t *above,
666 const uint16_t *left, int bd) {
667 const __m128i sixteen = _mm_cvtsi32_si128(16);
668 const __m128i sum = dc_sum_32(above);
669 const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
670 (void)left;
671 (void)bd;
672 dc_store_32xh(dst, stride, 32, &dc);
673 }
674
void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // Fill with the mid-range value 1 << (bd - 1); neighbors are unused.
  (void)above;
  (void)left;
  const __m128i mid = _mm_cvtsi32_si128(1 << (bd - 1));
  const __m128i dc = _mm_shufflelo_epi16(mid, 0x0);
  dc_store_32xh(dst, stride, 32, &dc);
}
684
685 // -----------------------------------------------------------------------------
686 // V_PRED
687
void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // V prediction: copy the 4-pixel above row into every output row.
  // The left column and bit depth are unused.
  (void)left;
  (void)bd;
  const __m128i row = _mm_loadl_epi64((const __m128i *)above);
  for (int r = 0; r < 8; ++r, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, row);
  }
}
703
void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // V prediction: copy the 8-pixel above row into each of the 4 output rows.
  (void)left;
  (void)bd;
  const __m128i row = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 4; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row);
  }
}
715
void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // V prediction: copy the 8-pixel above row into each of the 16 output rows.
  (void)left;
  (void)bd;
  const __m128i row = _mm_load_si128((const __m128i *)above);
  for (int r = 0; r < 16; ++r, dst += stride) {
    _mm_store_si128((__m128i *)dst, row);
  }
}
731
// V_PRED 16x8: each row is the 16 `above` samples, held in two 8-lane
// registers. Aligned loads/stores: `above` and dst rows must be 16-byte
// aligned.
void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i lo = _mm_load_si128((const __m128i *)above);
  const __m128i hi = _mm_load_si128((const __m128i *)(above + 8));
  for (int r = 0; r < 8; ++r) {
    _mm_store_si128((__m128i *)dst, lo);
    _mm_store_si128((__m128i *)(dst + 8), hi);
    dst += stride;
  }
}
755
// V_PRED 16x32: each of the 32 rows is the 16 `above` samples (two 8-lane
// registers). Aligned loads/stores: `above` and dst rows must be 16-byte
// aligned.
void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  const __m128i lo = _mm_load_si128((const __m128i *)above);
  const __m128i hi = _mm_load_si128((const __m128i *)(above + 8));
  for (int r = 0; r < 32; ++r) {
    _mm_store_si128((__m128i *)dst, lo);
    _mm_store_si128((__m128i *)(dst + 8), hi);
    dst += stride;
  }
}
779
// V_PRED 32x16: each of the 16 rows is the 32 `above` samples, held in
// four 8-lane registers. Aligned loads/stores: `above` and dst rows must
// be 16-byte aligned.
void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  (void)left;
  (void)bd;
  __m128i cols[4];
  for (int j = 0; j < 4; ++j) {
    cols[j] = _mm_load_si128((const __m128i *)(above + 8 * j));
  }
  for (int r = 0; r < 16; ++r) {
    for (int j = 0; j < 4; ++j) {
      _mm_store_si128((__m128i *)(dst + 8 * j), cols[j]);
    }
    dst += stride;
  }
}
813
814 // -----------------------------------------------------------------------------
815 // DC_PRED
816
aom_highbd_dc_predictor_4x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)817 void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
818 const uint16_t *above,
819 const uint16_t *left, int bd) {
820 (void)bd;
821 const __m128i sum_above = dc_sum_4(above);
822 const __m128i sum_left = dc_sum_8(left);
823 const __m128i sum = _mm_add_epi16(sum_above, sum_left);
824 uint32_t sum32 = _mm_cvtsi128_si32(sum);
825 sum32 >>= 16;
826 sum32 += 6;
827 sum32 /= 12;
828 const __m128i row = _mm_set1_epi16((uint16_t)sum32);
829 int i;
830 for (i = 0; i < 4; ++i) {
831 _mm_storel_epi64((__m128i *)dst, row);
832 dst += stride;
833 _mm_storel_epi64((__m128i *)dst, row);
834 dst += stride;
835 }
836 }
837
aom_highbd_dc_predictor_8x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)838 void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
839 const uint16_t *above,
840 const uint16_t *left, int bd) {
841 (void)bd;
842 const __m128i sum_left = dc_sum_4(left);
843 const __m128i sum_above = dc_sum_8(above);
844 const __m128i sum = _mm_add_epi16(sum_above, sum_left);
845 uint32_t sum32 = _mm_cvtsi128_si32(sum);
846 sum32 >>= 16;
847 sum32 += 6;
848 sum32 /= 12;
849 const __m128i row = _mm_set1_epi16((uint16_t)sum32);
850
851 _mm_store_si128((__m128i *)dst, row);
852 dst += stride;
853 _mm_store_si128((__m128i *)dst, row);
854 dst += stride;
855 _mm_store_si128((__m128i *)dst, row);
856 dst += stride;
857 _mm_store_si128((__m128i *)dst, row);
858 }
859
aom_highbd_dc_predictor_8x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)860 void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
861 const uint16_t *above,
862 const uint16_t *left, int bd) {
863 (void)bd;
864 __m128i sum_left = dc_sum_16(left);
865 __m128i sum_above = dc_sum_8(above);
866 const __m128i zero = _mm_setzero_si128();
867 sum_left = _mm_unpacklo_epi16(sum_left, zero);
868 sum_above = _mm_unpacklo_epi16(sum_above, zero);
869 const __m128i sum = _mm_add_epi32(sum_left, sum_above);
870 uint32_t sum32 = _mm_cvtsi128_si32(sum);
871 sum32 += 12;
872 sum32 /= 24;
873 const __m128i row = _mm_set1_epi16((uint16_t)sum32);
874 int i;
875 for (i = 0; i < 4; ++i) {
876 _mm_store_si128((__m128i *)dst, row);
877 dst += stride;
878 _mm_store_si128((__m128i *)dst, row);
879 dst += stride;
880 _mm_store_si128((__m128i *)dst, row);
881 dst += stride;
882 _mm_store_si128((__m128i *)dst, row);
883 dst += stride;
884 }
885 }
886
aom_highbd_dc_predictor_16x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)887 void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
888 const uint16_t *above,
889 const uint16_t *left, int bd) {
890 (void)bd;
891 __m128i sum_left = dc_sum_8(left);
892 __m128i sum_above = dc_sum_16(above);
893 const __m128i zero = _mm_setzero_si128();
894 sum_left = _mm_unpacklo_epi16(sum_left, zero);
895 sum_above = _mm_unpacklo_epi16(sum_above, zero);
896 const __m128i sum = _mm_add_epi32(sum_left, sum_above);
897 uint32_t sum32 = _mm_cvtsi128_si32(sum);
898 sum32 += 12;
899 sum32 /= 24;
900 const __m128i row = _mm_set1_epi16((uint16_t)sum32);
901 int i;
902 for (i = 0; i < 2; ++i) {
903 _mm_store_si128((__m128i *)dst, row);
904 _mm_store_si128((__m128i *)(dst + 8), row);
905 dst += stride;
906 _mm_store_si128((__m128i *)dst, row);
907 _mm_store_si128((__m128i *)(dst + 8), row);
908 dst += stride;
909 _mm_store_si128((__m128i *)dst, row);
910 _mm_store_si128((__m128i *)(dst + 8), row);
911 dst += stride;
912 _mm_store_si128((__m128i *)dst, row);
913 _mm_store_si128((__m128i *)(dst + 8), row);
914 dst += stride;
915 }
916 }
917
aom_highbd_dc_predictor_16x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)918 void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
919 const uint16_t *above,
920 const uint16_t *left, int bd) {
921 (void)bd;
922 __m128i sum_left = dc_sum_32(left);
923 __m128i sum_above = dc_sum_16(above);
924 const __m128i zero = _mm_setzero_si128();
925 sum_above = _mm_unpacklo_epi16(sum_above, zero);
926 const __m128i sum = _mm_add_epi32(sum_left, sum_above);
927 uint32_t sum32 = _mm_cvtsi128_si32(sum);
928 sum32 += 24;
929 sum32 /= 48;
930 const __m128i row = _mm_set1_epi16((uint16_t)sum32);
931 int i;
932 for (i = 0; i < 8; ++i) {
933 _mm_store_si128((__m128i *)dst, row);
934 _mm_store_si128((__m128i *)(dst + 8), row);
935 dst += stride;
936 _mm_store_si128((__m128i *)dst, row);
937 _mm_store_si128((__m128i *)(dst + 8), row);
938 dst += stride;
939 _mm_store_si128((__m128i *)dst, row);
940 _mm_store_si128((__m128i *)(dst + 8), row);
941 dst += stride;
942 _mm_store_si128((__m128i *)dst, row);
943 _mm_store_si128((__m128i *)(dst + 8), row);
944 dst += stride;
945 }
946 }
947
aom_highbd_dc_predictor_32x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)948 void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
949 const uint16_t *above,
950 const uint16_t *left, int bd) {
951 (void)bd;
952 __m128i sum_left = dc_sum_16(left);
953 __m128i sum_above = dc_sum_32(above);
954 const __m128i zero = _mm_setzero_si128();
955 sum_left = _mm_unpacklo_epi16(sum_left, zero);
956 const __m128i sum = _mm_add_epi32(sum_left, sum_above);
957 uint32_t sum32 = _mm_cvtsi128_si32(sum);
958 sum32 += 24;
959 sum32 /= 48;
960 const __m128i row = _mm_set1_epi16((uint16_t)sum32);
961 int i;
962 for (i = 0; i < 4; ++i) {
963 _mm_store_si128((__m128i *)dst, row);
964 _mm_store_si128((__m128i *)(dst + 8), row);
965 _mm_store_si128((__m128i *)(dst + 16), row);
966 _mm_store_si128((__m128i *)(dst + 24), row);
967 dst += stride;
968 _mm_store_si128((__m128i *)dst, row);
969 _mm_store_si128((__m128i *)(dst + 8), row);
970 _mm_store_si128((__m128i *)(dst + 16), row);
971 _mm_store_si128((__m128i *)(dst + 24), row);
972 dst += stride;
973 _mm_store_si128((__m128i *)dst, row);
974 _mm_store_si128((__m128i *)(dst + 8), row);
975 _mm_store_si128((__m128i *)(dst + 16), row);
976 _mm_store_si128((__m128i *)(dst + 24), row);
977 dst += stride;
978 _mm_store_si128((__m128i *)dst, row);
979 _mm_store_si128((__m128i *)(dst + 8), row);
980 _mm_store_si128((__m128i *)(dst + 16), row);
981 _mm_store_si128((__m128i *)(dst + 24), row);
982 dst += stride;
983 }
984 }
985