/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
#include "vpx_ports/mem.h"

#if CONFIG_VP9_HIGHBITDEPTH
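// One butterfly pass of the 8x8 Hadamard transform on 32-bit lanes. The first
// pass (iter == 0) also transposes the 8x8 block via the unpack/permute
// sequence, so the second pass (iter == 1) operates on the original columns.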
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}

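// 8x8 Hadamard transform of 16-bit residuals, widened to 32 bits so the
// larger dynamic range of high-bitdepth input survives the transform.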
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                  tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));

  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}

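// 16x16: run the 8x8 transform on each quadrant, then combine the four
// coefficient sets with one more butterfly stage, shifted right by 1 to
// normalize.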
void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 1);
    b1 = _mm256_srai_epi32(b1, 1);
    b2 = _mm256_srai_epi32(b2, 1);
    b3 = _mm256_srai_epi32(b3, 1);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}

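// 32x32: same construction as 16x16, built from four 16x16 transforms and a
// final butterfly stage shifted right by 2.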
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
                                    ptrdiff_t src_stride, tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH

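// Low-bitdepth butterfly pass on 16-bit lanes. Each 256-bit register holds
// one row from each of two adjacent 8x8 blocks (one block per 128-bit lane),
// so both blocks are transformed together.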
static void hadamard_col8x2_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi16(a0, a1);
  __m256i b1 = _mm256_sub_epi16(a0, a1);
  __m256i b2 = _mm256_add_epi16(a2, a3);
  __m256i b3 = _mm256_sub_epi16(a2, a3);
  __m256i b4 = _mm256_add_epi16(a4, a5);
  __m256i b5 = _mm256_sub_epi16(a4, a5);
  __m256i b6 = _mm256_add_epi16(a6, a7);
  __m256i b7 = _mm256_sub_epi16(a6, a7);

  a0 = _mm256_add_epi16(b0, b2);
  a1 = _mm256_add_epi16(b1, b3);
  a2 = _mm256_sub_epi16(b0, b2);
  a3 = _mm256_sub_epi16(b1, b3);
  a4 = _mm256_add_epi16(b4, b6);
  a5 = _mm256_add_epi16(b5, b7);
  a6 = _mm256_sub_epi16(b4, b6);
  a7 = _mm256_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi16(a0, a4);
    b7 = _mm256_add_epi16(a1, a5);
    b3 = _mm256_add_epi16(a2, a6);
    b4 = _mm256_add_epi16(a3, a7);
    b2 = _mm256_sub_epi16(a0, a4);
    b6 = _mm256_sub_epi16(a1, a5);
    b1 = _mm256_sub_epi16(a2, a6);
    b5 = _mm256_sub_epi16(a3, a7);

    a0 = _mm256_unpacklo_epi16(b0, b1);
    a1 = _mm256_unpacklo_epi16(b2, b3);
    a2 = _mm256_unpackhi_epi16(b0, b1);
    a3 = _mm256_unpackhi_epi16(b2, b3);
    a4 = _mm256_unpacklo_epi16(b4, b5);
    a5 = _mm256_unpacklo_epi16(b6, b7);
    a6 = _mm256_unpackhi_epi16(b4, b5);
    a7 = _mm256_unpackhi_epi16(b6, b7);

    b0 = _mm256_unpacklo_epi32(a0, a1);
    b1 = _mm256_unpacklo_epi32(a4, a5);
    b2 = _mm256_unpackhi_epi32(a0, a1);
    b3 = _mm256_unpackhi_epi32(a4, a5);
    b4 = _mm256_unpacklo_epi32(a2, a3);
    b5 = _mm256_unpacklo_epi32(a6, a7);
    b6 = _mm256_unpackhi_epi32(a2, a3);
    b7 = _mm256_unpackhi_epi32(a6, a7);

    in[0] = _mm256_unpacklo_epi64(b0, b1);
    in[1] = _mm256_unpackhi_epi64(b0, b1);
    in[2] = _mm256_unpacklo_epi64(b2, b3);
    in[3] = _mm256_unpackhi_epi64(b2, b3);
    in[4] = _mm256_unpacklo_epi64(b4, b5);
    in[5] = _mm256_unpackhi_epi64(b4, b5);
    in[6] = _mm256_unpacklo_epi64(b6, b7);
    in[7] = _mm256_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm256_add_epi16(a0, a4);
    in[7] = _mm256_add_epi16(a1, a5);
    in[3] = _mm256_add_epi16(a2, a6);
    in[4] = _mm256_add_epi16(a3, a7);
    in[2] = _mm256_sub_epi16(a0, a4);
    in[6] = _mm256_sub_epi16(a1, a5);
    in[1] = _mm256_sub_epi16(a2, a6);
    in[5] = _mm256_sub_epi16(a3, a7);
  }
}

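// Hadamard transform of two horizontally adjacent 8x8 blocks. The final
// permutes separate the lanes again: the first 64 outputs are the left
// block's coefficients, the next 64 the right block's.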
static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  __m256i src[8];
  src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
  src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
  src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));

  hadamard_col8x2_avx2(src, 0);
  hadamard_col8x2_avx2(src, 1);

  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x20));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[0], src[1], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[2], src[3], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[4], src[5], 0x31));
  coeff += 16;
  _mm256_storeu_si256((__m256i *)coeff,
                      _mm256_permute2x128_si256(src[6], src[7], 0x31));
}

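// Shared 16x16 helper. When is_final is set, results go out as tran_low_t via
// store_tran_low(); otherwise they are kept as int16_t for the 32x32 caller's
// next combining stage.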
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}

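// Public 16x16 entry point: a single final-pass call to the shared helper.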
void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
}

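// Public 32x32 entry point: four non-final 16x16 passes into a 16-bit scratch
// buffer, then a final combining stage written out with store_tran_low().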
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 2);
    b1 = _mm256_srai_epi16(b1, 2);
    b2 = _mm256_srai_epi16(b2, 2);
    b3 = _mm256_srai_epi16(b3, 2);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}

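// Sum of absolute values of the coefficients, 16 at a time, accumulated into
// eight 32-bit partial sums and then reduced. Assumes length is a multiple of
// 16.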
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
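// High-bitdepth SATD: coefficients are already 32-bit, so the absolute values
// are accumulated directly, eight per iteration.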
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH