• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "aom_dsp/aom_simd.h"
13 #define SIMD_FUNC(name) name##_avx2
14 #include "av1/common/cdef_block_simd.h"
15 
/* Byte-shuffle control for _mm256_shuffle_epi8. Within each 128-bit lane it
   reverses the order of 16-bit elements 0..6 and leaves element 7 where it
   is. The same four control words are repeated for the upper lane. */
const int shuffle_reg_256bit[8] = {
  0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100,  // low 128-bit lane
  0x0b0a0d0c, 0x07060908, 0x03020504, 0x0f0e0100,  // high 128-bit lane
};
20 
21 /* partial A is a 16-bit vector of the form:
22 [x8 - - x1 | x16 - - x9] and partial B has the form:
23 [0  y1 - y7 | 0 y9 - y15].
24 This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
25 (x7^2+y2^7)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8 constants
26 are in const1 and const2. */
fold_mul_and_sum_avx2(__m256i * partiala,__m256i * partialb,const __m256i * const1,const __m256i * const2)27 static INLINE __m256i fold_mul_and_sum_avx2(__m256i *partiala,
28                                             __m256i *partialb,
29                                             const __m256i *const1,
30                                             const __m256i *const2) {
31   __m256i tmp;
32   /* Reverse partial B. */
33   *partialb = _mm256_shuffle_epi8(
34       *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));
35 
36   /* Interleave the x and y values of identical indices and pair x8 with 0. */
37   tmp = *partiala;
38   *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
39   *partialb = _mm256_unpackhi_epi16(tmp, *partialb);
40 
41   /* Square and add the corresponding x and y values. */
42   *partiala = _mm256_madd_epi16(*partiala, *partiala);
43   *partialb = _mm256_madd_epi16(*partialb, *partialb);
44   /* Multiply by constant. */
45   *partiala = _mm256_mullo_epi32(*partiala, *const1);
46   *partialb = _mm256_mullo_epi32(*partialb, *const2);
47   /* Sum all results. */
48   *partiala = _mm256_add_epi32(*partiala, *partialb);
49   return *partiala;
50 }
51 
/* Horizontally sums four vectors of 32-bit values, lane by lane. Within each
   128-bit lane, element k of the returned vector is the sum of the four
   elements of *xk. As a side effect the four inputs are overwritten with
   their per-lane 4x4 transpose (*xj ends up holding element j of every
   input). */
static INLINE __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
                                 __m256i *x3) {
  const __m256i a = *x0;
  const __m256i b = *x1;
  const __m256i c = *x2;
  const __m256i d = *x3;

  /* First pass: interleave 32-bit elements of (a, b) and of (c, d). */
  const __m256i ab_lo = _mm256_unpacklo_epi32(a, b);
  const __m256i ab_hi = _mm256_unpackhi_epi32(a, b);
  const __m256i cd_lo = _mm256_unpacklo_epi32(c, d);
  const __m256i cd_hi = _mm256_unpackhi_epi32(c, d);

  /* Second pass: interleave 64-bit halves to finish the transpose. */
  const __m256i col0 = _mm256_unpacklo_epi64(ab_lo, cd_lo);
  const __m256i col1 = _mm256_unpackhi_epi64(ab_lo, cd_lo);
  const __m256i col2 = _mm256_unpacklo_epi64(ab_hi, cd_hi);
  const __m256i col3 = _mm256_unpackhi_epi64(ab_hi, cd_hi);

  *x0 = col0;
  *x1 = col1;
  *x2 = col2;
  *x3 = col3;

  /* Adding the transposed columns gives one row total per element. */
  return _mm256_add_epi32(_mm256_add_epi32(col0, col1),
                          _mm256_add_epi32(col2, col3));
}
66 
67 /* Computes cost for directions 0, 5, 6 and 7. We can call this function again
68 to compute the remaining directions. */
compute_directions_avx2(__m256i * lines,int32_t cost_frist_8x8[4],int32_t cost_second_8x8[4])69 static INLINE __m256i compute_directions_avx2(__m256i *lines,
70                                               int32_t cost_frist_8x8[4],
71                                               int32_t cost_second_8x8[4]) {
72   __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
73   __m256i partial6;
74   __m256i tmp;
75   /* Partial sums for lines 0 and 1. */
76   partial4a = _mm256_slli_si256(lines[0], 14);
77   partial4b = _mm256_srli_si256(lines[0], 2);
78   partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
79   partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
80   tmp = _mm256_add_epi16(lines[0], lines[1]);
81   partial5a = _mm256_slli_si256(tmp, 10);
82   partial5b = _mm256_srli_si256(tmp, 6);
83   partial7a = _mm256_slli_si256(tmp, 4);
84   partial7b = _mm256_srli_si256(tmp, 12);
85   partial6 = tmp;
86 
87   /* Partial sums for lines 2 and 3. */
88   partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
89   partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
90   partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
91   partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
92   tmp = _mm256_add_epi16(lines[2], lines[3]);
93   partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
94   partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
95   partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
96   partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
97   partial6 = _mm256_add_epi16(partial6, tmp);
98 
99   /* Partial sums for lines 4 and 5. */
100   partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
101   partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
102   partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
103   partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
104   tmp = _mm256_add_epi16(lines[4], lines[5]);
105   partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
106   partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
107   partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
108   partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
109   partial6 = _mm256_add_epi16(partial6, tmp);
110 
111   /* Partial sums for lines 6 and 7. */
112   partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
113   partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
114   partial4a = _mm256_add_epi16(partial4a, lines[7]);
115   tmp = _mm256_add_epi16(lines[6], lines[7]);
116   partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
117   partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
118   partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
119   partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
120   partial6 = _mm256_add_epi16(partial6, tmp);
121 
122   const __m256i const_reg_1 =
123       _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
124   const __m256i const_reg_2 =
125       _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
126   const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
127   const __m256i const_reg_4 =
128       _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
129 
130   /* Compute costs in terms of partial sums. */
131   partial4a =
132       fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
133   partial7a =
134       fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
135   partial5a =
136       fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
137   partial6 = _mm256_madd_epi16(partial6, partial6);
138   partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));
139 
140   partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
141   _mm_storeu_si128((__m128i *)cost_frist_8x8,
142                    _mm256_castsi256_si128(partial4a));
143   _mm_storeu_si128((__m128i *)cost_second_8x8,
144                    _mm256_extractf128_si256(partial4a, 1));
145 
146   return partial4a;
147 }
148 
/* Transposes an 8x8 array of 16-bit values and reverses the order of the
   resulting rows, independently in each 128-bit lane: res[r][c] = in[c][7 - r].
   This is equivalent to a 90-degree counter-clockwise rotation of the pixels.
   All inputs are read before any output is written, so res may be the same
   array as in. */
static INLINE void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  /* Stage 1: interleave 16-bit elements of adjacent row pairs. */
  const __m256i r01_lo = _mm256_unpacklo_epi16(in[0], in[1]);
  const __m256i r01_hi = _mm256_unpackhi_epi16(in[0], in[1]);
  const __m256i r23_lo = _mm256_unpacklo_epi16(in[2], in[3]);
  const __m256i r23_hi = _mm256_unpackhi_epi16(in[2], in[3]);
  const __m256i r45_lo = _mm256_unpacklo_epi16(in[4], in[5]);
  const __m256i r45_hi = _mm256_unpackhi_epi16(in[4], in[5]);
  const __m256i r67_lo = _mm256_unpacklo_epi16(in[6], in[7]);
  const __m256i r67_hi = _mm256_unpackhi_epi16(in[6], in[7]);

  /* Stage 2: interleave 32-bit pairs. "top" covers rows 0-3 and "bot" rows
     4-7; the suffix names the two source columns each vector now holds. */
  const __m256i top_c01 = _mm256_unpacklo_epi32(r01_lo, r23_lo);
  const __m256i top_c23 = _mm256_unpackhi_epi32(r01_lo, r23_lo);
  const __m256i top_c45 = _mm256_unpacklo_epi32(r01_hi, r23_hi);
  const __m256i top_c67 = _mm256_unpackhi_epi32(r01_hi, r23_hi);
  const __m256i bot_c01 = _mm256_unpacklo_epi32(r45_lo, r67_lo);
  const __m256i bot_c23 = _mm256_unpackhi_epi32(r45_lo, r67_lo);
  const __m256i bot_c45 = _mm256_unpacklo_epi32(r45_hi, r67_hi);
  const __m256i bot_c67 = _mm256_unpackhi_epi32(r45_hi, r67_hi);

  /* Stage 3: join the halves; source column k lands in output row 7 - k. */
  res[0] = _mm256_unpackhi_epi64(top_c67, bot_c67);
  res[1] = _mm256_unpacklo_epi64(top_c67, bot_c67);
  res[2] = _mm256_unpackhi_epi64(top_c45, bot_c45);
  res[3] = _mm256_unpacklo_epi64(top_c45, bot_c45);
  res[4] = _mm256_unpackhi_epi64(top_c23, bot_c23);
  res[5] = _mm256_unpacklo_epi64(top_c23, bot_c23);
  res[6] = _mm256_unpackhi_epi64(top_c01, bot_c01);
  res[7] = _mm256_unpacklo_epi64(top_c01, bot_c01);
}
179 
cdef_find_dir_dual_avx2(const uint16_t * img1,const uint16_t * img2,int stride,int32_t * var_out_1st,int32_t * var_out_2nd,int coeff_shift,int * out_dir_1st_8x8,int * out_dir_2nd_8x8)180 void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
181                              int stride, int32_t *var_out_1st,
182                              int32_t *var_out_2nd, int coeff_shift,
183                              int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
184   int32_t cost_first_8x8[8];
185   int32_t cost_second_8x8[8];
186   // Used to store the best cost for 2 8x8's.
187   int32_t best_cost[2] = { 0 };
188   // Best direction for 2 8x8's.
189   int best_dir[2] = { 0 };
190 
191   const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
192   const __m256i const_128_reg = _mm256_set1_epi16(128);
193   __m256i lines[8];
194   for (int i = 0; i < 8; i++) {
195     const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
196     const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);
197 
198     lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
199     lines[i] = _mm256_sub_epi16(
200         _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
201   }
202 
203   /* Compute "mostly vertical" directions. */
204   const __m256i dir47 =
205       compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);
206 
207   /* Transpose and reverse the order of the lines. */
208   array_reverse_transpose_8x8_avx2(lines, lines);
209 
210   /* Compute "mostly horizontal" directions. */
211   const __m256i dir03 =
212       compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);
213 
214   __m256i max = _mm256_max_epi32(dir03, dir47);
215   max =
216       _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
217                                             _mm256_slli_si256(max, 16 - (8))));
218   max =
219       _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
220                                             _mm256_slli_si256(max, 16 - (4))));
221 
222   const __m128i first_8x8_output = _mm256_castsi256_si128(max);
223   const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
224   const __m128i cmpeg_res_00 =
225       _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
226   const __m128i cmpeg_res_01 =
227       _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
228   const __m128i cmpeg_res_10 =
229       _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
230   const __m128i cmpeg_res_11 =
231       _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
232   const __m128i t_first_8x8 = _mm_packs_epi32(cmpeg_res_01, cmpeg_res_00);
233   const __m128i t_second_8x8 = _mm_packs_epi32(cmpeg_res_11, cmpeg_res_10);
234 
235   best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
236   best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
237   best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
238   best_dir[0] =
239       get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
240   best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
241   best_dir[1] =
242       get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
243 
244   /* Difference between the optimal variance and the variance along the
245      orthogonal direction. Again, the sum(x^2) terms cancel out. */
246   *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
247   *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];
248 
249   /* We'd normally divide by 840, but dividing by 1024 is close enough
250   for what we're going to do with this. */
251   *var_out_1st >>= 10;
252   *var_out_2nd >>= 10;
253   *out_dir_1st_8x8 = best_dir[0];
254   *out_dir_2nd_8x8 = best_dir[1];
255 }
256 
/* Loads four consecutive bytes from p into the low 32 bits of a vector, with
   p[0] in the least-significant byte. The value is assembled byte by byte so
   that no uint8_t storage is read through an int32_t pointer; p needs no
   particular alignment. */
static INLINE __m128i cdef_load_4x8bit_avx2(const uint8_t *p) {
  const uint32_t packed = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                          ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
  return _mm_cvtsi32_si128((int)packed);
}

/* Copies a width x height rectangle of 8-bit pixels into a 16-bit buffer,
   zero-extending every pixel.
     dst, dstride: destination and its row pitch in 16-bit elements.
     src, sstride: source and its row pitch in bytes.
     width, height: rectangle size; both must be positive and height must be
                    even, because the vector paths handle two rows per
                    iteration (checked with assert only).
   Columns are consumed in chunks of 32, 16, 8 and 4 pixels; whatever is left
   (at most 3 columns) is copied one pixel at a time. Exactly width pixels
   are read and written per row. */
void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;  // First column not yet copied.
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);

  // Process multiple 32 pixels at a time.
  if (remaining_width > 31) {
    int i = 0;
    do {
      j = 0;
      do {
        __m128i row00 =
            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
        __m128i row01 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
        __m128i row10 =
            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
        __m128i row11 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row00));
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row01));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row10));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row11));
        j += 32;
      } while (j <= width - 32);
      i += 2;
    } while (i < height);
    // j now sits at the largest multiple of 32 not exceeding width.
    remaining_width = width & 31;
  }

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    int i = 0;
    do {
      // Byte-wise load: src is a uint8_t buffer with arbitrary alignment, so
      // it must not be dereferenced through an int32_t pointer.
      __m128i row0 = cdef_load_4x8bit_avx2(&src[(i + 0) * sstride + j]);
      __m128i row1 = cdef_load_4x8bit_avx2(&src[(i + 1) * sstride + j]);
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}
358