/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"
#include "aom_dsp/x86/intrapred_x86.h"
#include "aom_dsp/x86/intrapred_utils.h"
#include "aom_dsp/x86/lpf_common_sse2.h"

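// dc_sum_64() / dc_sum_32() below sum 64 (resp. 32) unsigned bytes by taking
// _mm256_sad_epu8 against zero and folding the 64-bit partial sums into the
// low lane. A scalar sketch of the value they produce (an illustrative note,
// not part of the original source):
//
//   static int dc_sum_scalar(const uint8_t *ref, int n) {
//     int sum = 0;
//     for (int i = 0; i < n; ++i) sum += ref[i];
//     return sum;  // at most 64 * 255, so it fits in 16 bits
//   }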
static INLINE __m256i dc_sum_64(const uint8_t *ref) {
  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
  const __m256i zero = _mm256_setzero_si256();
  __m256i y0 = _mm256_sad_epu8(x0, zero);
  __m256i y1 = _mm256_sad_epu8(x1, zero);
  y0 = _mm256_add_epi64(y0, y1);
  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
  y0 = _mm256_add_epi64(u0, y0);
  u0 = _mm256_unpackhi_epi64(y0, y0);
  return _mm256_add_epi16(y0, u0);
}

static INLINE __m256i dc_sum_32(const uint8_t *ref) {
  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i zero = _mm256_setzero_si256();
  __m256i y = _mm256_sad_epu8(x, zero);
  __m256i u = _mm256_permute2x128_si256(y, y, 1);
  y = _mm256_add_epi64(u, y);
  u = _mm256_unpackhi_epi64(y, y);
  return _mm256_add_epi16(y, u);
}

static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r);
    dst += stride;
  }
}

static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
                                    int height, uint8_t *dst,
                                    ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r0);
    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
    dst += stride;
  }
}

static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r);
    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
    dst += stride;
  }
}

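// Shuffle/blend constants used by the high-bitdepth directional predictors
// later in this file (a descriptive note inferred from the table contents):
// HighbdLoadMaskx[i] shifts a row of 16-bit samples right by i lanes,
// replicating sample 0 into the vacated positions; the HighbdEvenOddMaskx*
// tables de-interleave even- and odd-indexed 16-bit samples for the upsampled
// paths; HighbdBaseMask[i] enables the first i 16-bit lanes for blending
// against the max-base pixel.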
static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
};

static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
};

static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
  { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
    0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
  { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
    0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
    0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
    0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
};

static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
    0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
    0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
    0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
};

static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;

  r0 = _mm_unpacklo_epi16(x[0], x[1]);
  r1 = _mm_unpacklo_epi16(x[2], x[3]);
  r2 = _mm_unpacklo_epi16(x[4], x[5]);
  r3 = _mm_unpacklo_epi16(x[6], x[7]);

  r4 = _mm_unpacklo_epi16(x[8], x[9]);
  r5 = _mm_unpacklo_epi16(x[10], x[11]);
  r6 = _mm_unpacklo_epi16(x[12], x[13]);
  r7 = _mm_unpacklo_epi16(x[14], x[15]);

  r8 = _mm_unpacklo_epi32(r0, r1);
  r9 = _mm_unpackhi_epi32(r0, r1);
  r10 = _mm_unpacklo_epi32(r2, r3);
  r11 = _mm_unpackhi_epi32(r2, r3);

  r12 = _mm_unpacklo_epi32(r4, r5);
  r13 = _mm_unpackhi_epi32(r4, r5);
  r14 = _mm_unpacklo_epi32(r6, r7);
  r15 = _mm_unpackhi_epi32(r6, r7);

  r0 = _mm_unpacklo_epi64(r8, r9);
  r1 = _mm_unpackhi_epi64(r8, r9);
  r2 = _mm_unpacklo_epi64(r10, r11);
  r3 = _mm_unpackhi_epi64(r10, r11);

  r4 = _mm_unpacklo_epi64(r12, r13);
  r5 = _mm_unpackhi_epi64(r12, r13);
  r6 = _mm_unpacklo_epi64(r14, r15);
  r7 = _mm_unpackhi_epi64(r14, r15);

  d[0] = _mm_unpacklo_epi64(r0, r2);
  d[1] = _mm_unpacklo_epi64(r4, r6);
  d[2] = _mm_unpacklo_epi64(r1, r3);
  d[3] = _mm_unpacklo_epi64(r5, r7);

  d[4] = _mm_unpackhi_epi64(r0, r2);
  d[5] = _mm_unpackhi_epi64(r4, r6);
  d[6] = _mm_unpackhi_epi64(r1, r3);
  d[7] = _mm_unpackhi_epi64(r5, r7);
}

static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;

  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
}

static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;

  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73

  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75

  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77

  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
}

static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;
  __m256i dd[16];
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //

  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //

  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);  //
  ww1 = _mm256_unpacklo_epi32(w2, w3);  //

  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);  //
  ww1 = _mm256_unpackhi_epi32(w2, w3);  //

  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
  w3 = _mm256_unpacklo_epi16(x[14], x[15]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
  w3 = _mm256_unpackhi_epi16(x[14], x[15]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);

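  // The two halves above are each an 8x16 sub-transpose: dd[0..7] come from
  // input rows x[0..7] and dd[8..15] from x[8..15]. The loop below stitches
  // their 128-bit lanes back together so every d[i] holds one complete
  // 16-sample output row (a descriptive note added for clarity).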
  for (int i = 0; i < 8; i++) {
    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
                                       _mm256_extracti128_si256(dd[i], 1), 0);
  }
}

void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_32(above);
  __m256i sum_left = dc_sum_32(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
  sum_left = _mm256_srai_epi16(sum_left, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_32(left);
  (void)above;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 32, dst, stride);
}

// There are 32 rows in total. This function does lines
// 0,1,2,3 and 16,17,18,19. The next call does lines
// 4,5,6,7 and 20,21,22,23, so four calls finish all 32 rows.
static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
                                        ptrdiff_t stride) {
  __m256i t[4];
  __m256i m = _mm256_setzero_si256();
  const __m256i inc = _mm256_set1_epi8(4);
  int i;

  for (i = 0; i < 4; i++) {
    t[i] = _mm256_shuffle_epi8(*row, m);
    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
    _mm256_storeu_si256((__m256i *)dst, r0);
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
    dst += stride;
    m = _mm256_add_epi8(m, inc);
  }
}

void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);

  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);

  __m256i v = _mm256_unpacklo_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  v = _mm256_unpackhi_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  u = _mm256_unpackhi_epi8(left_col, left_col);

  v = _mm256_unpacklo_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  v = _mm256_unpackhi_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
}

// -----------------------------------------------------------------------------
// Rectangle
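// The rectangular DC predictors below average all (w + h) boundary samples
// with rounding: dc = (sum_above + sum_left + (w + h) / 2) / (w + h). That is
// what the "+ 24, / 48" pair does for 32x16, "+ 48, / 96" for 32x64 and
// 64x32, "+ 40, / 80" for 64x16, and "+ 64, / 128" for 64x64 (an explanatory
// note, not in the original source).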
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m128i top_sum = dc_sum_32_sse2(above);
  __m128i left_sum = dc_sum_16_sse2(left);
  left_sum = _mm_add_epi16(top_sum, left_sum);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
  sum += 24;
  sum /= 48;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_32(above);
  __m256i sum_left = dc_sum_64(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 48;
  sum /= 96;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = dc_sum_64(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 64;
  sum /= 128;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = dc_sum_32(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 48;
  sum /= 96;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 40;
  sum /= 80;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i sum = dc_sum_16_sse2(left);
  (void)above;

  const __m128i eight = _mm_set1_epi16(8);
  sum = _mm_add_epi16(sum, eight);
  sum = _mm_srai_epi16(sum, 4);
  const __m128i zero = _mm_setzero_si128();
  const __m128i r = _mm_shuffle_epi8(sum, zero);
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_64(left);
  (void)above;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_64(left);
  (void)above;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_32(left);
  (void)above;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i sum = dc_sum_16_sse2(left);
  (void)above;

  const __m128i eight = _mm_set1_epi16(8);
  sum = _mm_add_epi16(sum, eight);
  sum = _mm_srai_epi16(sum, 4);
  const __m128i zero = _mm_setzero_si128();
  const __m128i r = _mm_shuffle_epi8(sum, zero);
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_64xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_64xh(&row, 16, dst, stride);
}

void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 16, dst, stride);
}

void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 64, dst, stride);
}

void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 64, dst, stride);
}

void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 32, dst, stride);
}

void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 16, dst, stride);
}

// -----------------------------------------------------------------------------
// PAETH_PRED

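// A scalar reference for the Paeth rule that the branch-free SIMD below
// implements (an illustrative sketch, not part of the original file):
//
//   int paeth(int left, int top, int topleft) {
//     const int base = top + left - topleft;
//     const int pl = abs(base - left);
//     const int pt = abs(base - top);
//     const int ptl = abs(base - topleft);
//     if (pl <= pt && pl <= ptl) return left;
//     return (pt <= ptl) ? top : topleft;
//   }
//
// paeth_pred() evaluates the same selection on 16 lanes at once using
// compares and masks instead of branches.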
// Return 16 16-bit pixels in one row (__m256i)
static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
                                 const __m256i *topleft) {
  const __m256i base =
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);

  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));

  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);

  pl = _mm256_andnot_si256(mask1, *left);

  ptl = _mm256_and_si256(mask2, *topleft);
  pt = _mm256_andnot_si256(mask2, *top);
  pt = _mm256_or_si256(pt, ptl);
  pt = _mm256_and_si256(mask1, pt);

  return _mm256_or_si256(pt, pl);
}

// Return 16 8-bit pixels in one row (__m128i)
static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
                                      const __m256i *topleft) {
  const __m256i p0 = paeth_pred(left, top, topleft);
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i p = _mm256_packus_epi16(p0, p1);
  return _mm256_castsi256_si128(p);
}

static INLINE __m256i get_top_vector(const uint8_t *above) {
  const __m128i x = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
}

void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

static INLINE __m256i get_left_vector(const uint8_t *left) {
  const __m128i x = _mm_load_si128((const __m128i *)left);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
}

void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  for (int j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

// Return 32 8-bit pixels in one row (__m256i)
static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                      const __m256i *top1,
                                      const __m256i *topleft) {
  __m256i p0 = paeth_pred(left, top0, topleft);
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x0 = _mm256_packus_epi16(p0, p1);

  p0 = paeth_pred(left, top1, topleft);
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x1 = _mm256_packus_epi16(p0, p1);

  return _mm256_permute2x128_si256(x0, x1, 0x20);
}

void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);

    _mm256_storeu_si256((__m256i *)dst, r);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  const __m256i l = get_left_vector(left);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
#define PERM2x128(c0, c1) c0 + (c1 << 4)
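// PERM4x64 packs four 2-bit qword selectors into the immediate operand
// expected by _mm256_permute4x64_epi64; PERM2x128 packs two 4-bit lane
// selectors for _mm256_permute2x128_si256 (a descriptive note added for
// clarity).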

static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((N + 4) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
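  //
  // Equivalently, for one output sample (a scalar sketch following the
  // variable names used below, for the upsample_above == 0 case):
  //
  //   int base = x >> frac_bits;     // integer sample position
  //   int shift = (x & 0x3f) >> 1;   // fractional part scaled to 0..31
  //   uint16_t pred =
  //       (above[base] * (32 - shift) + above[base + 1] * shift + 16) >> 5;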
  __m256i a0, a1, a32, a16;
  __m256i diff, c3f;
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
  __m128i a0_128, a1_128;
  a16 = _mm256_set1_epi16(16);
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
  max_base_x128 = _mm_set1_epi16(max_base_x);
  c3f = _mm256_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));

    if (upsample_above) {
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
      a1_128 = _mm_srli_si128(a0_128, 8);

      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
                                   base + 10, base + 12, base + 14);
      shift = _mm256_srli_epi16(
          _mm256_and_si256(
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
              _mm256_set1_epi16(0x3f)),
          1);
    } else {
      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
                                   base + 5, base + 6, base + 7);
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
    }
    a0 = _mm256_castsi128_si256(a0_128);
    a1 = _mm256_castsi128_si256(a1_128);
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);
    res1 = _mm256_castsi256_si128(res);

    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
    x += dx;
  }
}

static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((N + 4) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i diff;
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
  max_base_x128 = _mm_set1_epi32(max_base_x);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
              _mm256_set1_epi32(0x3f)),
          1);
    } else {
      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_castsi256_si128(res);
    res1 = _mm_packus_epi32(res1, res1);

    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bit
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
    x += dx;
  }
}

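// For bd < 12 the intermediate value above[x] * 32 + 16 + diff * shift still
// fits in 16 bits, so the faster 16-bit path can be used; 12-bit input needs
// the 32-bit variant (an explanatory note, not in the original source).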
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
                                             ptrdiff_t stride,
                                             const uint16_t *above,
                                             int upsample_above, int dx,
                                             int bd) {
  __m128i dstvec[16];
  if (bd < 12) {
    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
                                              dx);
  } else {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
                                                    upsample_above, dx);
  }
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((8 + N) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a0_1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi32(max_base_x);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, res1, shift;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
      }
      return;
    }

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));

      a0_1 =
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a0_1 = _mm256_permutevar8x32_epi32(
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));

      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
      base_inc256 =
          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
                            base + 10, base + 12, base + 14);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
              _mm256_set1_epi32(0x3f)),
          1);
    } else {
      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
                                      base + 4, base + 5, base + 6, base + 7);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_packus_epi32(
        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
    mask256 = _mm256_packs_epi32(
        mask256, _mm256_castsi128_si256(
1280                      _mm256_extracti128_si256(mask256, 1)));  // convert to 16 bit
1281     res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1282     dst[r] = _mm256_castsi256_si128(res1);
1283     x += dx;
1284   }
1285 }
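
// Note on the packing idiom used just above: _mm256_packus_epi32 packs within
// each 128-bit lane, so packing 'res' against a copy of its own upper lane
// (extracti128 + castsi128) leaves all eight saturated 16-bit results
// contiguous in the low 128 bits, which is what ends up in dst[r] after the
// mask blend.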
1286 
1287 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1288     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1289   const int frac_bits = 6 - upsample_above;
1290   const int max_base_x = ((8 + N) - 1) << upsample_above;
1291 
1292   assert(dx > 0);
1293   // pre-filter above pixels
1294   // store in temp buffers:
1295   //   above[x] * 32 + 16
1296   //   above[x+1] - above[x]
1297   // final pixels will be calculated as:
1298   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1299   __m256i a0, a1, a32, a16, c3f;
1300   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1301   __m128i a0_x128, a1_x128;
1302 
1303   a16 = _mm256_set1_epi16(16);
1304   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1305   max_base_x256 = _mm256_set1_epi16(max_base_x);
1306   c3f = _mm256_set1_epi16(0x3f);
1307 
1308   int x = dx;
1309   for (int r = 0; r < N; r++) {
1310     __m256i b, res, res1, shift;
1311 
1312     int base = x >> frac_bits;
1313     if (base >= max_base_x) {
1314       for (int i = r; i < N; ++i) {
1315         dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
1316       }
1317       return;
1318     }
1319 
1320     a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1321     if (upsample_above) {
1322       __m128i mask, atmp0, atmp1, atmp2, atmp3;
1323       a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1324       atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1325       atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1326       atmp2 =
1327           _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1328       atmp3 =
1329           _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1330       mask =
1331           _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1332       a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1333       mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1334                             _mm_set1_epi8(15));
1335       a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1336 
1337       base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1338                                       base + 8, base + 10, base + 12, base + 14,
1339                                       0, 0, 0, 0, 0, 0, 0, 0);
1340       shift = _mm256_srli_epi16(
1341           _mm256_and_si256(
1342               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1343           1);
1344     } else {
1345       a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1346       base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1347                                       base + 4, base + 5, base + 6, base + 7, 0,
1348                                       0, 0, 0, 0, 0, 0, 0);
1349       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1350     }
1351     a0 = _mm256_castsi128_si256(a0_x128);
1352     a1 = _mm256_castsi128_si256(a1_x128);
1353 
1354     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1355     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1356     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1357 
1358     b = _mm256_mullo_epi16(diff, shift);
1359     res = _mm256_add_epi16(a32, b);
1360     res = _mm256_srli_epi16(res, 5);
1361 
1362     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1363     res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1364     dst[r] = _mm256_castsi256_si128(res1);
1365     x += dx;
1366   }
1367 }
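
// Note on the upsample_above path above: the upsampled edge stores samples at
// half-pel spacing, so the two interpolation taps for column c are
// above[base + 2 * c] and above[base + 2 * c + 1]. The HighbdEvenOddMaskx
// shuffle splits the 16 loaded values into those even (a0_x128) and odd
// (a1_x128) halves, the per-column base index advances by 2, and the
// fractional shift is taken from x << 1 before masking with 0x3f.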
1368 
1369 static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1370                                              ptrdiff_t stride,
1371                                              const uint16_t *above,
1372                                              int upsample_above, int dx,
1373                                              int bd) {
1374   __m128i dstvec[32];
1375   if (bd < 12) {
1376     highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1377                                               dx);
1378   } else {
1379     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1380                                                     upsample_above, dx);
1381   }
1382   for (int i = 0; i < N; i++) {
1383     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1384   }
1385 }
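
// Why the wrappers pick a kernel by bit depth: the 16-bit kernels keep
// above[x] * 32 + 16 + (above[x+1] - above[x]) * shift in 16-bit lanes. For
// 10-bit input the worst case is 1023 * 32 + 16 + 1023 * 31 = 64465, which
// still fits in a 16-bit lane, but for 12-bit input 4095 * 32 = 131040
// already overflows, so bd == 12 is routed to the *_32bit_* variants that
// widen to 32-bit lanes before the multiply-add.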
1386 
1387 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1388     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1389   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1390   (void)upsample_above;
1391   const int frac_bits = 6;
1392   const int max_base_x = ((16 + N) - 1);
1393 
1394   // pre-filter above pixels
1395   // store in temp buffers:
1396   //   above[x] * 32 + 16
1397   //   above[x+1] - above[x]
1398   // final pixels will be calculated as:
1399   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1400   __m256i a0, a0_1, a1, a1_1, a32, a16;
1401   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1402 
1403   a16 = _mm256_set1_epi32(16);
1404   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1405   max_base_x256 = _mm256_set1_epi16(max_base_x);
1406 
1407   int x = dx;
1408   for (int r = 0; r < N; r++) {
1409     __m256i b, res[2], res1;
1410 
1411     int base = x >> frac_bits;
1412     if (base >= max_base_x) {
1413       for (int i = r; i < N; ++i) {
1414         dstvec[i] = a_mbase_x;  // save 16 values
1415       }
1416       return;
1417     }
1418     __m256i shift = _mm256_srli_epi32(
1419         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1420 
1421     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1422     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1423 
1424     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1425     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1426     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1427     b = _mm256_mullo_epi32(diff, shift);
1428 
1429     res[0] = _mm256_add_epi32(a32, b);
1430     res[0] = _mm256_srli_epi32(res[0], 5);
1431     res[0] = _mm256_packus_epi32(
1432         res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1433 
1434     int mdif = max_base_x - base;
1435     if (mdif > 8) {
1436       a0_1 =
1437           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1438       a1_1 =
1439           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1440 
1441       diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1442       a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1443       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1444       b = _mm256_mullo_epi32(diff, shift);
1445 
1446       res[1] = _mm256_add_epi32(a32, b);
1447       res[1] = _mm256_srli_epi32(res[1], 5);
1448       res[1] = _mm256_packus_epi32(
1449           res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1450     } else {
1451       res[1] = a_mbase_x;
1452     }
1453     res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1454                                    1);  // 16 16bit values
1455 
1456     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1457                                     base + 4, base + 5, base + 6, base + 7,
1458                                     base + 8, base + 9, base + 10, base + 11,
1459                                     base + 12, base + 13, base + 14, base + 15);
1460     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1461     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1462     x += dx;
1463   }
1464 }
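
// Note on the mdif test above: the 32-bit 16-wide kernel produces each row as
// two 8-lane halves. The upper half is only computed while more than eight
// valid edge samples remain (mdif > 8); otherwise it is filled with the
// replicated last sample, and the final 16-lane compare against base_inc256
// blends away any lanes that ran past max_base_x.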
1465 
1466 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1467     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1468   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1469   (void)upsample_above;
1470   const int frac_bits = 6;
1471   const int max_base_x = ((16 + N) - 1);
1472 
1473   // pre-filter above pixels
1474   // store in temp buffers:
1475   //   above[x] * 32 + 16
1476   //   above[x+1] - above[x]
1477   // final pixels will be calculated as:
1478   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1479   __m256i a0, a1, a32, a16, c3f;
1480   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1481 
1482   a16 = _mm256_set1_epi16(16);
1483   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1484   max_base_x256 = _mm256_set1_epi16(max_base_x);
1485   c3f = _mm256_set1_epi16(0x3f);
1486 
1487   int x = dx;
1488   for (int r = 0; r < N; r++) {
1489     __m256i b, res;
1490 
1491     int base = x >> frac_bits;
1492     if (base >= max_base_x) {
1493       for (int i = r; i < N; ++i) {
1494         dstvec[i] = a_mbase_x;  // save 16 values
1495       }
1496       return;
1497     }
1498     __m256i shift =
1499         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1500 
1501     a0 = _mm256_loadu_si256((__m256i *)(above + base));
1502     a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1503 
1504     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1505     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1506     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1507     b = _mm256_mullo_epi16(diff, shift);
1508 
1509     res = _mm256_add_epi16(a32, b);
1510     res = _mm256_srli_epi16(res, 5);  // 16 16bit values
1511 
1512     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1513                                     base + 4, base + 5, base + 6, base + 7,
1514                                     base + 8, base + 9, base + 10, base + 11,
1515                                     base + 12, base + 13, base + 14, base + 15);
1516     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1517     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1518     x += dx;
1519   }
1520 }
1521 
1522 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1523                                               ptrdiff_t stride,
1524                                               const uint16_t *above,
1525                                               int upsample_above, int dx,
1526                                               int bd) {
1527   __m256i dstvec[64];
1528   if (bd < 12) {
1529     highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1530                                                dx);
1531   } else {
1532     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1533                                                      upsample_above, dx);
1534   }
1535   for (int i = 0; i < N; i++) {
1536     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1537   }
1538 }
1539 
1540 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1541     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1542   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1543   (void)upsample_above;
1544   const int frac_bits = 6;
1545   const int max_base_x = ((32 + N) - 1);
1546 
1547   // pre-filter above pixels
1548   // store in temp buffers:
1549   //   above[x] * 32 + 16
1550   //   above[x+1] - above[x]
1551   // final pixels will be calculated as:
1552   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1553   __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1554   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1555 
1556   a16 = _mm256_set1_epi32(16);
1557   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1558   max_base_x256 = _mm256_set1_epi16(max_base_x);
1559   c3f = _mm256_set1_epi16(0x3f);
1560 
1561   int x = dx;
1562   for (int r = 0; r < N; r++) {
1563     __m256i b, res[2], res1;
1564 
1565     int base = x >> frac_bits;
1566     if (base >= max_base_x) {
1567       for (int i = r; i < N; ++i) {
1568         dstvec[i] = a_mbase_x;  // save 32 values
1569         dstvec[i + N] = a_mbase_x;
1570       }
1571       return;
1572     }
1573 
1574     __m256i shift =
1575         _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1576 
1577     for (int j = 0; j < 32; j += 16) {
1578       int mdif = max_base_x - (base + j);
1579       if (mdif <= 0) {
1580         res1 = a_mbase_x;
1581       } else {
1582         a0 = _mm256_cvtepu16_epi32(
1583             _mm_loadu_si128((__m128i *)(above + base + j)));
1584         a1 = _mm256_cvtepu16_epi32(
1585             _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1586 
1587         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1588         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1589         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1590         b = _mm256_mullo_epi32(diff, shift);
1591 
1592         res[0] = _mm256_add_epi32(a32, b);
1593         res[0] = _mm256_srli_epi32(res[0], 5);
1594         res[0] = _mm256_packus_epi32(
1595             res[0],
1596             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1597         if (mdif > 8) {
1598           a0_1 = _mm256_cvtepu16_epi32(
1599               _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1600           a1_1 = _mm256_cvtepu16_epi32(
1601               _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1602 
1603           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1604           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1605           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1606           b = _mm256_mullo_epi32(diff, shift);
1607 
1608           res[1] = _mm256_add_epi32(a32, b);
1609           res[1] = _mm256_srli_epi32(res[1], 5);
1610           res[1] = _mm256_packus_epi32(
1611               res[1],
1612               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1613         } else {
1614           res[1] = a_mbase_x;
1615         }
1616         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1617                                        1);  // 16 16bit values
1618         base_inc256 = _mm256_setr_epi16(
1619             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1620             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1621             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1622             base + j + 13, base + j + 14, base + j + 15);
1623 
1624         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1625         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1626       }
1627       if (!j) {
1628         dstvec[r] = res1;
1629       } else {
1630         dstvec[r + N] = res1;
1631       }
1632     }
1633     x += dx;
1634   }
1635 }
1636 
1637 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1638     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1639   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1640   (void)upsample_above;
1641   const int frac_bits = 6;
1642   const int max_base_x = ((32 + N) - 1);
1643 
1644   // pre-filter above pixels
1645   // store in temp buffers:
1646   //   above[x] * 32 + 16
1647   //   above[x+1] - above[x]
1648   // final pixels will be calculated as:
1649   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1650   __m256i a0, a1, a32, a16, c3f;
1651   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1652 
1653   a16 = _mm256_set1_epi16(16);
1654   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1655   max_base_x256 = _mm256_set1_epi16(max_base_x);
1656   c3f = _mm256_set1_epi16(0x3f);
1657 
1658   int x = dx;
1659   for (int r = 0; r < N; r++) {
1660     __m256i b, res;
1661 
1662     int base = x >> frac_bits;
1663     if (base >= max_base_x) {
1664       for (int i = r; i < N; ++i) {
1665         dstvec[i] = a_mbase_x;  // save 32 values
1666         dstvec[i + N] = a_mbase_x;
1667       }
1668       return;
1669     }
1670 
1671     __m256i shift =
1672         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1673 
1674     for (int j = 0; j < 32; j += 16) {
1675       int mdif = max_base_x - (base + j);
1676       if (mdif <= 0) {
1677         res = a_mbase_x;
1678       } else {
1679         a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1680         a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1681 
1682         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1683         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1684         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1685         b = _mm256_mullo_epi16(diff, shift);
1686 
1687         res = _mm256_add_epi16(a32, b);
1688         res = _mm256_srli_epi16(res, 5);
1689 
1690         base_inc256 = _mm256_setr_epi16(
1691             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1692             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1693             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1694             base + j + 13, base + j + 14, base + j + 15);
1695 
1696         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1697         res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1698       }
1699       if (!j) {
1700         dstvec[r] = res;
1701       } else {
1702         dstvec[r + N] = res;
1703       }
1704     }
1705     x += dx;
1706   }
1707 }
1708 
1709 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1710                                               ptrdiff_t stride,
1711                                               const uint16_t *above,
1712                                               int upsample_above, int dx,
1713                                               int bd) {
1714   __m256i dstvec[128];
1715   if (bd < 12) {
1716     highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1717                                                dx);
1718   } else {
1719     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1720                                                      upsample_above, dx);
1721   }
1722   for (int i = 0; i < N; i++) {
1723     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1724     _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1725   }
1726 }
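
// Storage layout used by the 32-wide path: each row is produced as two
// 16-pixel vectors kept as dstvec[i] (left half) and dstvec[i + N] (right
// half), which the loop above writes to dst + stride * i and
// dst + stride * i + 16 respectively.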
1727 
1728 static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1729                                                     ptrdiff_t stride,
1730                                                     const uint16_t *above,
1731                                                     int upsample_above,
1732                                                     int dx) {
1733   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1734   (void)upsample_above;
1735   const int frac_bits = 6;
1736   const int max_base_x = ((64 + N) - 1);
1737 
1738   // pre-filter above pixels
1739   // store in temp buffers:
1740   //   above[x] * 32 + 16
1741   //   above[x+1] - above[x]
1742   // final pixels will be calculated as:
1743   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1744   __m256i a0, a0_1, a1, a1_1, a32, a16;
1745   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1746 
1747   a16 = _mm256_set1_epi32(16);
1748   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1749   max_base_x256 = _mm256_set1_epi16(max_base_x);
1750 
1751   int x = dx;
1752   for (int r = 0; r < N; r++, dst += stride) {
1753     __m256i b, res[2], res1;
1754 
1755     int base = x >> frac_bits;
1756     if (base >= max_base_x) {
1757       for (int i = r; i < N; ++i) {
1758         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1759         _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1760         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1761         _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1762         dst += stride;
1763       }
1764       return;
1765     }
1766 
1767     __m256i shift = _mm256_srli_epi32(
1768         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1769 
1770     __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1771     for (int j = 0; j < 64; j += 16) {
1772       int mdif = max_base_x - (base + j);
1773       if (mdif <= 0) {
1774         _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1775       } else {
1776         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1777         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1778         a0 = _mm256_cvtepu16_epi32(a0_128);
1779         a1 = _mm256_cvtepu16_epi32(a1_128);
1780 
1781         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
1782         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
1783         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
1784         b = _mm256_mullo_epi32(diff, shift);
1785 
1786         res[0] = _mm256_add_epi32(a32, b);
1787         res[0] = _mm256_srli_epi32(res[0], 5);
1788         res[0] = _mm256_packus_epi32(
1789             res[0],
1790             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1791         if (mdif > 8) {
1792           a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1793           a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1794           a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1795           a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1796 
1797           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
1798           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
1799           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
1800           b = _mm256_mullo_epi32(diff, shift);
1801 
1802           res[1] = _mm256_add_epi32(a32, b);
1803           res[1] = _mm256_srli_epi32(res[1], 5);
1804           res[1] = _mm256_packus_epi32(
1805               res[1],
1806               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1807         } else {
1808           res[1] = a_mbase_x;
1809         }
1810         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1811                                        1);  // 16 16bit values
1812         base_inc256 = _mm256_setr_epi16(
1813             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1814             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1815             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1816             base + j + 13, base + j + 14, base + j + 15);
1817 
1818         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1819         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1820         _mm256_storeu_si256((__m256i *)(dst + j), res1);
1821       }
1822     }
1823     x += dx;
1824   }
1825 }
1826 
1827 static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1828                                               ptrdiff_t stride,
1829                                               const uint16_t *above,
1830                                               int upsample_above, int dx) {
1831   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1832   (void)upsample_above;
1833   const int frac_bits = 6;
1834   const int max_base_x = ((64 + N) - 1);
1835 
1836   // pre-filter above pixels
1837   // store in temp buffers:
1838   //   above[x] * 32 + 16
1839   //   above[x+1] - above[x]
1840   // final pixels will be calculated as:
1841   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1842   __m256i a0, a1, a32, a16, c3f;
1843   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1844 
1845   a16 = _mm256_set1_epi16(16);
1846   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1847   max_base_x256 = _mm256_set1_epi16(max_base_x);
1848   c3f = _mm256_set1_epi16(0x3f);
1849 
1850   int x = dx;
1851   for (int r = 0; r < N; r++, dst += stride) {
1852     __m256i b, res;
1853 
1854     int base = x >> frac_bits;
1855     if (base >= max_base_x) {
1856       for (int i = r; i < N; ++i) {
1857         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
1858         _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1859         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1860         _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1861         dst += stride;
1862       }
1863       return;
1864     }
1865 
1866     __m256i shift =
1867         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1868 
1869     for (int j = 0; j < 64; j += 16) {
1870       int mdif = max_base_x - (base + j);
1871       if (mdif <= 0) {
1872         _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1873       } else {
1874         a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1875         a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1876 
1877         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
1878         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
1879         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
1880         b = _mm256_mullo_epi16(diff, shift);
1881 
1882         res = _mm256_add_epi16(a32, b);
1883         res = _mm256_srli_epi16(res, 5);
1884 
1885         base_inc256 = _mm256_setr_epi16(
1886             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1887             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1888             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1889             base + j + 13, base + j + 14, base + j + 15);
1890 
1891         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1892         res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1893         _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
1894       }
1895     }
1896     x += dx;
1897   }
1898 }
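
// Unlike the narrower widths, the 64-wide kernels above do not buffer rows in
// a dstvec array: each row is computed in four 16-pixel chunks and stored
// straight to dst inside the loop, avoiding a 64xN-sized temporary.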
1899 
1900 // Directional prediction, zone 1: 0 < angle < 90
1901 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1902                                       int bh, const uint16_t *above,
1903                                       const uint16_t *left, int upsample_above,
1904                                       int dx, int dy, int bd) {
1905   (void)left;
1906   (void)dy;
1907 
1908   switch (bw) {
1909     case 4:
1910       highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1911                                        dx, bd);
1912       break;
1913     case 8:
1914       highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1915                                        dx, bd);
1916       break;
1917     case 16:
1918       highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1919                                         dx, bd);
1920       break;
1921     case 32:
1922       highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1923                                         dx, bd);
1924       break;
1925     case 64:
1926       if (bd < 12) {
1927         highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1928                                           upsample_above, dx);
1929       } else {
1930         highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1931                                                 upsample_above, dx);
1932       }
1933       break;
1934     default: break;
1935   }
1936   return;
1937 }
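
// A minimal, hypothetical call into the zone-1 entry point above (usage
// sketch only: buffer sizes and the dx value are illustrative; in libaom dx
// is derived from the prediction angle and 'above' is the prepared top edge).
static void highbd_z1_usage_sketch(void) {
  DECLARE_ALIGNED(32, uint16_t, above[48]);  // top edge, padded
  DECLARE_ALIGNED(32, uint16_t, pred[16 * 16]);
  for (int i = 0; i < 48; ++i) above[i] = 512;  // flat 10-bit edge
  // 16x16 block, no edge upsampling; dy is unused in zone 1.
  av1_highbd_dr_prediction_z1_avx2(pred, /*stride=*/16, /*bw=*/16, /*bh=*/16,
                                   above, /*left=*/NULL, /*upsample_above=*/0,
                                   /*dx=*/64, /*dy=*/0, /*bd=*/10);
}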
1938 
1939 static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1940                                       uint16_t *dst, ptrdiff_t pitchDst) {
1941   __m256i r[16];
1942   __m256i d[16];
1943   for (int j = 0; j < 16; j++) {
1944     r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1945   }
1946   highbd_transpose16x16_avx2(r, d);
1947   for (int j = 0; j < 16; j++) {
1948     _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1949   }
1950 }
1951 
1952 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1953                              uint16_t *dst, ptrdiff_t pitchDst, int width,
1954                              int height) {
1955   for (int j = 0; j < height; j += 16)
1956     for (int i = 0; i < width; i += 16)
1957       highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1958                                 dst + j * pitchDst + i, pitchDst);
1959 }
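
// Scalar equivalent of the tiled transpose above (illustrative helper, not
// part of this file): transposing 16x16 tiles in registers amounts to
// dst[j * pitchDst + i] = src[i * pitchSrc + j] over the whole
// width x height area.
static void highbd_transpose_scalar_sketch(const uint16_t *src,
                                           ptrdiff_t pitchSrc, uint16_t *dst,
                                           ptrdiff_t pitchDst, int width,
                                           int height) {
  for (int j = 0; j < height; ++j) {
    for (int i = 0; i < width; ++i) {
      dst[j * pitchDst + i] = src[i * pitchSrc + j];
    }
  }
}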
1960 
1961 static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1962     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1963     const uint16_t *left, int upsample_above, int upsample_left, int dx,
1964     int dy) {
1965   const int min_base_x = -(1 << upsample_above);
1966   const int min_base_y = -(1 << upsample_left);
1967   const int frac_bits_x = 6 - upsample_above;
1968   const int frac_bits_y = 6 - upsample_left;
1969 
1970   assert(dx > 0);
1971   // pre-filter above pixels
1972   // store in temp buffers:
1973   //   above[x] * 32 + 16
1974   //   above[x+1] - above[x]
1975   // final pixels will be calculated as:
1976   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1977   __m256i a0_x, a1_x, a32, a16;
1978   __m256i diff;
1979   __m128i c3f, min_base_y128;
1980 
1981   a16 = _mm256_set1_epi32(16);
1982   c3f = _mm_set1_epi32(0x3f);
1983   min_base_y128 = _mm_set1_epi32(min_base_y);
1984 
1985   for (int r = 0; r < N; r++) {
1986     __m256i b, res, shift;
1987     __m128i resx, resy, resxy;
1988     __m128i a0_x128, a1_x128;
1989     int y = r + 1;
1990     int base_x = (-y * dx) >> frac_bits_x;
1991     int base_shift = 0;
1992     if (base_x < (min_base_x - 1)) {
1993       base_shift = (min_base_x - base_x - 1) >> upsample_above;
1994     }
1995     int base_min_diff =
1996         (min_base_x - base_x + upsample_above) >> upsample_above;
1997     if (base_min_diff > 4) {
1998       base_min_diff = 4;
1999     } else {
2000       if (base_min_diff < 0) base_min_diff = 0;
2001     }
2002 
2003     if (base_shift > 3) {
2004       a0_x = _mm256_setzero_si256();
2005       a1_x = _mm256_setzero_si256();
2006       shift = _mm256_setzero_si256();
2007     } else {
2008       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2009       if (upsample_above) {
2010         a0_x128 = _mm_shuffle_epi8(a0_x128,
2011                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2012         a1_x128 = _mm_srli_si128(a0_x128, 8);
2013 
2014         shift = _mm256_castsi128_si256(_mm_srli_epi32(
2015             _mm_and_si128(
2016                 _mm_slli_epi32(
2017                     _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2018                                    (2 << 6) - y * dx, (3 << 6) - y * dx),
2019                     upsample_above),
2020                 c3f),
2021             1));
2022       } else {
2023         a0_x128 =
2024             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2025         a1_x128 = _mm_srli_si128(a0_x128, 2);
2026 
2027         shift = _mm256_castsi128_si256(_mm_srli_epi32(
2028             _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2029                                          (2 << 6) - y * dx, (3 << 6) - y * dx),
2030                           c3f),
2031             1));
2032       }
2033       a0_x = _mm256_cvtepu16_epi32(a0_x128);
2034       a1_x = _mm256_cvtepu16_epi32(a1_x128);
2035     }
2036     // y calc
2037     __m128i a0_y, a1_y, shifty;
2038     if (base_x < min_base_x) {
2039       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2040       DECLARE_ALIGNED(32, int, base_y_c[4]);
2041       r6 = _mm_set1_epi32(r << 6);
2042       dy128 = _mm_set1_epi32(dy);
2043       c1234 = _mm_setr_epi32(1, 2, 3, 4);
2044       y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2045       base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2046       mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2047       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2048       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2049 
2050       a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2051                             left[base_y_c[2]], left[base_y_c[3]]);
2052       a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2053                             left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2054 
2055       if (upsample_left) {
2056         shifty = _mm_srli_epi32(
2057             _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2058       } else {
2059         shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2060       }
2061       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2062       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2063       shift = _mm256_inserti128_si256(shift, shifty, 1);
2064     }
2065 
2066     diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2067     a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2068     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2069 
2070     b = _mm256_mullo_epi32(diff, shift);
2071     res = _mm256_add_epi32(a32, b);
2072     res = _mm256_srli_epi32(res, 5);
2073 
2074     resx = _mm256_castsi256_si128(res);
2075     resx = _mm_packus_epi32(resx, resx);
2076 
2077     resy = _mm256_extracti128_si256(res, 1);
2078     resy = _mm_packus_epi32(resy, resy);
2079 
2080     resxy =
2081         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2082     _mm_storel_epi64((__m128i *)(dst), resxy);
2083     dst += stride;
2084   }
2085 }
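
// Scalar sketch of the zone-2 computation that the Nx4/Nx8/HxW kernels here
// vectorize (illustrative only: the helper name is ours, and it assumes the
// same reference-array layout as the kernels, i.e. indices down to
// min_base_x / min_base_y are valid). Each pixel is interpolated from the top
// edge when its projected position lands there, and from the left edge
// otherwise; the SIMD code computes both candidates per row and blends them
// with the HighbdBaseMask table instead of branching per pixel.
static void highbd_dr_z2_scalar_sketch(int bw, int bh, uint16_t *dst,
                                       ptrdiff_t stride, const uint16_t *above,
                                       const uint16_t *left,
                                       int upsample_above, int upsample_left,
                                       int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;
  for (int r = 0; r < bh; ++r, dst += stride) {
    for (int c = 0; c < bw; ++c) {
      int val;
      const int x = (c << 6) - (r + 1) * dx;
      const int base_x = x >> frac_bits_x;
      if (base_x >= min_base_x) {  // projects onto the top edge
        const int shift = ((x * (1 << upsample_above)) & 0x3f) >> 1;
        val = above[base_x] * 32 + 16 +
              (above[base_x + 1] - above[base_x]) * shift;
      } else {  // projects onto the left edge
        const int y = (r << 6) - (c + 1) * dy;
        const int base_y = y >> frac_bits_y;
        const int shift = ((y * (1 << upsample_left)) & 0x3f) >> 1;
        val = left[base_y] * 32 + 16 +
              (left[base_y + 1] - left[base_y]) * shift;
      }
      dst[c] = (uint16_t)(val >> 5);
    }
  }
}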
2086 
2087 static void highbd_dr_prediction_z2_Nx4_avx2(
2088     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2089     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2090     int dy) {
2091   const int min_base_x = -(1 << upsample_above);
2092   const int min_base_y = -(1 << upsample_left);
2093   const int frac_bits_x = 6 - upsample_above;
2094   const int frac_bits_y = 6 - upsample_left;
2095 
2096   assert(dx > 0);
2097   // pre-filter above pixels
2098   // store in temp buffers:
2099   //   above[x] * 32 + 16
2100   //   above[x+1] - above[x]
2101   // final pixels will be calculated as:
2102   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2103   __m256i a0_x, a1_x, a32, a16;
2104   __m256i diff;
2105   __m128i c3f, min_base_y128;
2106 
2107   a16 = _mm256_set1_epi16(16);
2108   c3f = _mm_set1_epi16(0x3f);
2109   min_base_y128 = _mm_set1_epi16(min_base_y);
2110 
2111   for (int r = 0; r < N; r++) {
2112     __m256i b, res, shift;
2113     __m128i resx, resy, resxy;
2114     __m128i a0_x128, a1_x128;
2115     int y = r + 1;
2116     int base_x = (-y * dx) >> frac_bits_x;
2117     int base_shift = 0;
2118     if (base_x < (min_base_x - 1)) {
2119       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2120     }
2121     int base_min_diff =
2122         (min_base_x - base_x + upsample_above) >> upsample_above;
2123     if (base_min_diff > 4) {
2124       base_min_diff = 4;
2125     } else {
2126       if (base_min_diff < 0) base_min_diff = 0;
2127     }
2128 
2129     if (base_shift > 3) {
2130       a0_x = _mm256_setzero_si256();
2131       a1_x = _mm256_setzero_si256();
2132       shift = _mm256_setzero_si256();
2133     } else {
2134       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2135       if (upsample_above) {
2136         a0_x128 = _mm_shuffle_epi8(a0_x128,
2137                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2138         a1_x128 = _mm_srli_si128(a0_x128, 8);
2139 
2140         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2141             _mm_and_si128(
2142                 _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2143                                               (2 << 6) - y * dx,
2144                                               (3 << 6) - y * dx, 0, 0, 0, 0),
2145                                upsample_above),
2146                 c3f),
2147             1));
2148       } else {
2149         a0_x128 =
2150             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2151         a1_x128 = _mm_srli_si128(a0_x128, 2);
2152 
2153         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2154             _mm_and_si128(
2155                 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2156                                (3 << 6) - y * dx, 0, 0, 0, 0),
2157                 c3f),
2158             1));
2159       }
2160       a0_x = _mm256_castsi128_si256(a0_x128);
2161       a1_x = _mm256_castsi128_si256(a1_x128);
2162     }
2163     // y calc
2164     __m128i a0_y, a1_y, shifty;
2165     if (base_x < min_base_x) {
2166       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2167       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2168       r6 = _mm_set1_epi16(r << 6);
2169       dy128 = _mm_set1_epi16(dy);
2170       c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2171       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2172       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2173       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2174       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2175       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2176 
2177       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2178                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2179       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2180                             left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2181                             0, 0);
2182 
2183       if (upsample_left) {
2184         shifty = _mm_srli_epi16(
2185             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2186       } else {
2187         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2188       }
2189       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2190       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2191       shift = _mm256_inserti128_si256(shift, shifty, 1);
2192     }
2193 
2194     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2195     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2196     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2197 
2198     b = _mm256_mullo_epi16(diff, shift);
2199     res = _mm256_add_epi16(a32, b);
2200     res = _mm256_srli_epi16(res, 5);
2201 
2202     resx = _mm256_castsi256_si128(res);
2203     resy = _mm256_extracti128_si256(res, 1);
2204     resxy =
2205         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2206     _mm_storel_epi64((__m128i *)(dst), resxy);
2207     dst += stride;
2208   }
2209 }
2210 
2211 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2212     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2213     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2214     int dy) {
2215   const int min_base_x = -(1 << upsample_above);
2216   const int min_base_y = -(1 << upsample_left);
2217   const int frac_bits_x = 6 - upsample_above;
2218   const int frac_bits_y = 6 - upsample_left;
2219 
2220   // pre-filter above pixels
2221   // store in temp buffers:
2222   //   above[x] * 32 + 16
2223   //   above[x+1] - above[x]
2224   // final pixels will be calculated as:
2225   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2226   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2227   __m256i diff;
2228   __m128i a0_x128, a1_x128;
2229 
2230   a16 = _mm256_set1_epi32(16);
2231   c3f = _mm256_set1_epi32(0x3f);
2232   min_base_y256 = _mm256_set1_epi32(min_base_y);
2233 
2234   for (int r = 0; r < N; r++) {
2235     __m256i b, res, shift;
2236     __m128i resx, resy, resxy;
2237     int y = r + 1;
2238     int base_x = (-y * dx) >> frac_bits_x;
2239     int base_shift = 0;
2240     if (base_x < (min_base_x - 1)) {
2241       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2242     }
2243     int base_min_diff =
2244         (min_base_x - base_x + upsample_above) >> upsample_above;
2245     if (base_min_diff > 8) {
2246       base_min_diff = 8;
2247     } else {
2248       if (base_min_diff < 0) base_min_diff = 0;
2249     }
2250 
2251     if (base_shift > 7) {
2252       resx = _mm_setzero_si128();
2253     } else {
2254       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2255       if (upsample_above) {
2256         __m128i mask, atmp0, atmp1, atmp2, atmp3;
2257         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2258         atmp0 = _mm_shuffle_epi8(a0_x128,
2259                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2260         atmp1 = _mm_shuffle_epi8(a1_x128,
2261                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2262         atmp2 = _mm_shuffle_epi8(
2263             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2264         atmp3 = _mm_shuffle_epi8(
2265             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2266         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2267                               _mm_set1_epi8(15));
2268         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2269         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2270                               _mm_set1_epi8(15));
2271         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2272         shift = _mm256_srli_epi32(
2273             _mm256_and_si256(
2274                 _mm256_slli_epi32(
2275                     _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2276                                       (2 << 6) - y * dx, (3 << 6) - y * dx,
2277                                       (4 << 6) - y * dx, (5 << 6) - y * dx,
2278                                       (6 << 6) - y * dx, (7 << 6) - y * dx),
2279                     upsample_above),
2280                 c3f),
2281             1);
2282       } else {
2283         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2284         a0_x128 =
2285             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2286         a1_x128 =
2287             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2288 
2289         shift = _mm256_srli_epi32(
2290             _mm256_and_si256(
2291                 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2292                                   (3 << 6) - y * dx, (4 << 6) - y * dx,
2293                                   (5 << 6) - y * dx, (6 << 6) - y * dx,
2294                                   (7 << 6) - y * dx),
2295                 c3f),
2296             1);
2297       }
2298       a0_x = _mm256_cvtepu16_epi32(a0_x128);
2299       a1_x = _mm256_cvtepu16_epi32(a1_x128);
2300 
2301       diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2302       a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2303       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2304 
2305       b = _mm256_mullo_epi32(diff, shift);
2306       res = _mm256_add_epi32(a32, b);
2307       res = _mm256_srli_epi32(res, 5);
2308 
2309       resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2310           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2311     }
2312     // y calc
2313     if (base_x < min_base_x) {
2314       DECLARE_ALIGNED(32, int, base_y_c[8]);
2315       __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2316       r6 = _mm256_set1_epi32(r << 6);
2317       dy256 = _mm256_set1_epi32(dy);
2318       c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2319       y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2320       base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2321       mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2322       base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2323       _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2324 
2325       a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2326           left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2327           left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2328           left[base_y_c[6]], left[base_y_c[7]]));
2329       a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2330           left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2331           left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2332           left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2333 
2334       if (upsample_left) {
2335         shift = _mm256_srli_epi32(
2336             _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2337             1);
2338       } else {
2339         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2340       }
2341       diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2342       a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2343       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2344 
2345       b = _mm256_mullo_epi32(diff, shift);
2346       res = _mm256_add_epi32(a32, b);
2347       res = _mm256_srli_epi32(res, 5);
2348 
2349       resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2350           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2351     } else {
2352       resy = resx;
2353     }
2354     resxy =
2355         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2356     _mm_storeu_si128((__m128i *)(dst), resxy);
2357     dst += stride;
2358   }
2359 }
2360 
2361 static void highbd_dr_prediction_z2_Nx8_avx2(
2362     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2363     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2364     int dy) {
2365   const int min_base_x = -(1 << upsample_above);
2366   const int min_base_y = -(1 << upsample_left);
2367   const int frac_bits_x = 6 - upsample_above;
2368   const int frac_bits_y = 6 - upsample_left;
2369 
2370   // pre-filter above pixels
2371   // store in temp buffers:
2372   //   above[x] * 32 + 16
2373   //   above[x+1] - above[x]
2374   // final pixels will be calculated as:
2375   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2376   __m128i c3f, min_base_y128;
2377   __m256i a0_x, a1_x, diff, a32, a16;
2378   __m128i a0_x128, a1_x128;
2379 
2380   a16 = _mm256_set1_epi16(16);
2381   c3f = _mm_set1_epi16(0x3f);
2382   min_base_y128 = _mm_set1_epi16(min_base_y);
2383 
2384   for (int r = 0; r < N; r++) {
2385     __m256i b, res, shift;
2386     __m128i resx, resy, resxy;
2387     int y = r + 1;
2388     int base_x = (-y * dx) >> frac_bits_x;
2389     int base_shift = 0;
2390     if (base_x < (min_base_x - 1)) {
2391       base_shift = (min_base_x - base_x - 1) >> upsample_above;
2392     }
2393     int base_min_diff =
2394         (min_base_x - base_x + upsample_above) >> upsample_above;
2395     if (base_min_diff > 8) {
2396       base_min_diff = 8;
2397     } else {
2398       if (base_min_diff < 0) base_min_diff = 0;
2399     }
2400 
2401     if (base_shift > 7) {
2402       a0_x = _mm256_setzero_si256();
2403       a1_x = _mm256_setzero_si256();
2404       shift = _mm256_setzero_si256();
2405     } else {
2406       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2407       if (upsample_above) {
2408         __m128i mask, atmp0, atmp1, atmp2, atmp3;
2409         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2410         atmp0 = _mm_shuffle_epi8(a0_x128,
2411                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2412         atmp1 = _mm_shuffle_epi8(a1_x128,
2413                                  *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2414         atmp2 = _mm_shuffle_epi8(
2415             a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2416         atmp3 = _mm_shuffle_epi8(
2417             a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2418         mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2419                               _mm_set1_epi8(15));
2420         a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2421         mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2422                               _mm_set1_epi8(15));
2423         a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2424 
2425         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2426             _mm_and_si128(
2427                 _mm_slli_epi16(
2428                     _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2429                                    (2 << 6) - y * dx, (3 << 6) - y * dx,
2430                                    (4 << 6) - y * dx, (5 << 6) - y * dx,
2431                                    (6 << 6) - y * dx, (7 << 6) - y * dx),
2432                     upsample_above),
2433                 c3f),
2434             1));
2435       } else {
2436         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2437         a0_x128 =
2438             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2439         a1_x128 =
2440             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2441 
2442         shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2444                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
2445                                          (4 << 6) - y * dx, (5 << 6) - y * dx,
2446                                          (6 << 6) - y * dx, (7 << 6) - y * dx),
2447                           c3f),
2448             1));
2449       }
2450       a0_x = _mm256_castsi128_si256(a0_x128);
2451       a1_x = _mm256_castsi128_si256(a1_x128);
2452     }
2453 
2454     // y calc
2455     __m128i a0_y, a1_y, shifty;
2456     if (base_x < min_base_x) {
2457       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2458       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2459       r6 = _mm_set1_epi16(r << 6);
2460       dy128 = _mm_set1_epi16(dy);
2461       c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2462       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2463       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2464       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2465       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2466       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2467 
2468       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2469                             left[base_y_c[2]], left[base_y_c[3]],
2470                             left[base_y_c[4]], left[base_y_c[5]],
2471                             left[base_y_c[6]], left[base_y_c[7]]);
2472       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2473                             left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2474                             left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2475                             left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2476 
2477       if (upsample_left) {
2478         shifty = _mm_srli_epi16(
2479             _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2480       } else {
2481         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2482       }
2483       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2484       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2485       shift = _mm256_inserti128_si256(shift, shifty, 1);
2486     }
2487 
2488     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2489     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2490     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2491 
2492     b = _mm256_mullo_epi16(diff, shift);
2493     res = _mm256_add_epi16(a32, b);
2494     res = _mm256_srli_epi16(res, 5);
2495 
2496     resx = _mm256_castsi256_si128(res);
2497     resy = _mm256_extracti128_si256(res, 1);
2498 
2499     resxy =
2500         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2501     _mm_storeu_si128((__m128i *)(dst), resxy);
2502     dst += stride;
2503   }
2504 }
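
// Note on the resx/resy blend used throughout the zone-2 kernels:
// base_min_diff counts how many leading pixels of the current row project to
// the left of the top edge (base index below min_base_x), and the
// HighbdBaseMask[base_min_diff] entry is all-ones in that many leading 16-bit
// lanes, so the blend takes the left-edge result (resy) for those pixels and
// the top-edge result (resx) for the rest.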
2505 
2506 static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2507     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2508     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2509     int dy) {
2510   // here upsample_above and upsample_left are 0 by design of
2511   // av1_use_intra_edge_upsample
2512   const int min_base_x = -1;
2513   const int min_base_y = -1;
2514   (void)upsample_above;
2515   (void)upsample_left;
2516   const int frac_bits_x = 6;
2517   const int frac_bits_y = 6;
2518 
2519   // pre-filter above pixels
2520   // store in temp buffers:
2521   //   above[x] * 32 + 16
2522   //   above[x+1] - above[x]
2523   // final pixels will be calculated as:
2524   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
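  // Added scalar sketch (illustrative only, not compiled): per output pixel
  // (r, c) the vector code below evaluates roughly the following, mirroring
  // the generic zone-2 C reference; `val` is a hypothetical local used only
  // for this sketch.
  //
  //   int x = (c << 6) - (r + 1) * dx;           // Q6 position on `above`
  //   if ((x >> frac_bits_x) >= min_base_x) {
  //     int base  = x >> frac_bits_x;
  //     int shift = (x & 0x3f) >> 1;             // 0..31
  //     val = (above[base] * 32 + 16 +
  //            (above[base + 1] - above[base]) * shift) >> 5;
  //   } else {                                   // pixel falls on the left edge
  //     int y = (r << 6) - (c + 1) * dy;         // Q6 position on `left`
  //     int base  = y >> frac_bits_y;
  //     int shift = (y & 0x3f) >> 1;
  //     val = (left[base] * 32 + 16 +
  //            (left[base + 1] - left[base]) * shift) >> 5;
  //   }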
2525   __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2526   __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2527   __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2528   DECLARE_ALIGNED(32, int, base_y_c[16]);
2529 
2530   a16 = _mm256_set1_epi32(16);
2531   c1 = _mm256_srli_epi32(a16, 4);
2532   c8 = _mm256_srli_epi32(a16, 1);
2533   min_base_y256 = _mm256_set1_epi32(min_base_y);
2534   c3f = _mm256_set1_epi32(0x3f);
2535   dy256 = _mm256_set1_epi32(dy);
2536   c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2537   c1234 = _mm256_add_epi32(c0123, c1);
2538 
2539   for (int r = 0; r < H; r++) {
2540     __m256i b, res, shift, ydx;
2541     __m256i resx[2], resy[2];
2542     __m256i resxy, j256, r6;
2543     for (int j = 0; j < W; j += 16) {
2544       j256 = _mm256_set1_epi32(j);
2545       int y = r + 1;
2546       ydx = _mm256_set1_epi32(y * dx);
2547 
2548       int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2549       int base_shift = 0;
2550       if ((base_x) < (min_base_x - 1)) {
2551         base_shift = (min_base_x - base_x - 1);
2552       }
2553       int base_min_diff = (min_base_x - base_x);
2554       if (base_min_diff > 16) {
2555         base_min_diff = 16;
2556       } else {
2557         if (base_min_diff < 0) base_min_diff = 0;
2558       }
2559 
2560       if (base_shift > 7) {
2561         resx[0] = _mm256_setzero_si256();
2562       } else {
2563         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2564         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2565         a0_x128 =
2566             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2567         a1_x128 =
2568             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2569 
2570         a0_x = _mm256_cvtepu16_epi32(a0_x128);
2571         a1_x = _mm256_cvtepu16_epi32(a1_x128);
2572 
2573         r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2574         shift = _mm256_srli_epi32(
2575             _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2576 
2577         diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
2578         a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
2579         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2580 
2581         b = _mm256_mullo_epi32(diff, shift);
2582         res = _mm256_add_epi32(a32, b);
2583         res = _mm256_srli_epi32(res, 5);
2584 
2585         resx[0] = _mm256_packus_epi32(
2586             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2587       }
2588       int base_shift8 = 0;
2589       if ((base_x + 8) < (min_base_x - 1)) {
2590         base_shift8 = (min_base_x - (base_x + 8) - 1);
2591       }
2592       if (base_shift8 > 7) {
2593         resx[1] = _mm256_setzero_si256();
2594       } else {
2595         a0_1_x128 =
2596             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2597         a1_1_x128 =
2598             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2599         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2600                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
2601         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2602                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
2603 
2604         a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2605         a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2606 
2607         r6 = _mm256_slli_epi32(
2608             _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2609         shift = _mm256_srli_epi32(
2610             _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2611 
2612         diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
2613         a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
2614         a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
2615         b = _mm256_mullo_epi32(diff, shift);
2616 
2617         resx[1] = _mm256_add_epi32(a32, b);
2618         resx[1] = _mm256_srli_epi32(resx[1], 5);
2619         resx[1] = _mm256_packus_epi32(
2620             resx[1],
2621             _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2622       }
2623       resx[0] =
2624           _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2625                                   1);  // 16 16bit values
2626 
2627       // y calc
2628       resy[0] = _mm256_setzero_si256();
2629       if ((base_x < min_base_x)) {
2630         __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2631         r6 = _mm256_set1_epi32(r << 6);
2632         c256 = _mm256_add_epi32(j256, c1234);
2633         y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2634         base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2635         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2636         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2637         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2638         c256 = _mm256_add_epi32(c256, c8);
2639         y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2640         base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2641         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2642         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2643         _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2644 
2645         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2646             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2647             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2648             left[base_y_c[6]], left[base_y_c[7]]));
2649         a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2650             left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2651             left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2652             left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2653 
2654         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2655 
2656         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2657         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2658         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2659 
2660         b = _mm256_mullo_epi32(diff, shift);
2661         res = _mm256_add_epi32(a32, b);
2662         res = _mm256_srli_epi32(res, 5);
2663 
2664         resy[0] = _mm256_packus_epi32(
2665             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2666 
2667         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2668             left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2669             left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2670             left[base_y_c[14]], left[base_y_c[15]]));
2671         a1_y = _mm256_cvtepu16_epi32(
2672             _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2673                            left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2674                            left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2675                            left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2676         shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2677 
2678         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
2679         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
2680         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
2681 
2682         b = _mm256_mullo_epi32(diff, shift);
2683         res = _mm256_add_epi32(a32, b);
2684         res = _mm256_srli_epi32(res, 5);
2685 
2686         resy[1] = _mm256_packus_epi32(
2687             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2688 
2689         resy[0] =
2690             _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2691                                     1);  // 16 16bit values
2692       }
2693 
2694       resxy = _mm256_blendv_epi8(resx[0], resy[0],
2695                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
2696       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2697     }  // for j
2698     dst += stride;
2699   }
2700 }
2701 
2702 static void highbd_dr_prediction_z2_HxW_avx2(
2703     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2704     const uint16_t *left, int upsample_above, int upsample_left, int dx,
2705     int dy) {
2706   // here upsample_above and upsample_left are 0 by design of
2707   // av1_use_intra_edge_upsample
2708   const int min_base_x = -1;
2709   const int min_base_y = -1;
2710   (void)upsample_above;
2711   (void)upsample_left;
2712   const int frac_bits_x = 6;
2713   const int frac_bits_y = 6;
2714 
2715   // pre-filter above pixels
2716   // store in temp buffers:
2717   //   above[x] * 32 + 16
2718   //   above[x+1] - above[x]
2719   // final pixels will be calculated as:
2720   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
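  // Added note: this 16-bit-lane variant is only dispatched for bd < 12 (see
  // av1_highbd_dr_prediction_z2_avx2 below). For 10-bit input the worst-case
  // intermediate still fits an unsigned 16-bit lane:
  //   1023 * 32 + 16 + 1023 * 31 = 64465 <= 65535,
  // whereas 12-bit samples would overflow, which is why the 32-bit variant
  // above exists.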
2721   __m256i a0_x, a1_x, a32, a16, c3f, c1;
2722   __m256i diff, min_base_y256, dy256, c1234, c0123;
2723   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2724 
2725   a16 = _mm256_set1_epi16(16);
2726   c1 = _mm256_srli_epi16(a16, 4);
2727   min_base_y256 = _mm256_set1_epi16(min_base_y);
2728   c3f = _mm256_set1_epi16(0x3f);
2729   dy256 = _mm256_set1_epi16(dy);
2730   c0123 =
2731       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2732   c1234 = _mm256_add_epi16(c0123, c1);
2733 
2734   for (int r = 0; r < H; r++) {
2735     __m256i b, res, shift;
2736     __m256i resx, resy, ydx;
2737     __m256i resxy, j256, r6;
2738     __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2739     int y = r + 1;
2740     ydx = _mm256_set1_epi16((short)(y * dx));
2741 
2742     for (int j = 0; j < W; j += 16) {
2743       j256 = _mm256_set1_epi16(j);
2744       int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2745       int base_shift = 0;
2746       if ((base_x) < (min_base_x - 1)) {
2747         base_shift = (min_base_x - base_x - 1);
2748       }
2749       int base_min_diff = (min_base_x - base_x);
2750       if (base_min_diff > 16) {
2751         base_min_diff = 16;
2752       } else {
2753         if (base_min_diff < 0) base_min_diff = 0;
2754       }
2755 
2756       if (base_shift < 8) {
2757         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2758         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2759         a0_x128 =
2760             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2761         a1_x128 =
2762             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2763 
2764         a0_x = _mm256_castsi128_si256(a0_x128);
2765         a1_x = _mm256_castsi128_si256(a1_x128);
2766       } else {
2767         a0_x = _mm256_setzero_si256();
2768         a1_x = _mm256_setzero_si256();
2769       }
2770 
2771       int base_shift1 = 0;
2772       if (base_shift > 8) {
2773         base_shift1 = base_shift - 8;
2774       }
2775       if (base_shift1 < 8) {
2776         a0_1_x128 =
2777             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2778         a1_1_x128 =
2779             _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2780         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2781                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
2782         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2783                                      *(__m128i *)HighbdLoadMaskx[base_shift1]);
2784 
2785         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2786         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2787       }
2788       r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2789       shift = _mm256_srli_epi16(
2790           _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2791 
2792       diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
2793       a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
2794       a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2795 
2796       b = _mm256_mullo_epi16(diff, shift);
2797       res = _mm256_add_epi16(a32, b);
2798       resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
2799 
2800       // y calc
2801       resy = _mm256_setzero_si256();
2802       __m256i a0_y, a1_y, shifty;
2803       if ((base_x < min_base_x)) {
2804         __m256i c256, y_c256, base_y_c256, mask256, mul16;
2805         r6 = _mm256_set1_epi16(r << 6);
2806         c256 = _mm256_add_epi16(j256, c1234);
2807         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2808                                  _mm256_srli_epi16(min_base_y256, 1));
2809         y_c256 = _mm256_sub_epi16(r6, mul16);
2810         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2811         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2812         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2813         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2814 
2815         a0_y = _mm256_setr_epi16(
2816             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2817             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2818             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2819             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2820             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2821             left[base_y_c[15]]);
2822         base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2823         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2824 
2825         a1_y = _mm256_setr_epi16(
2826             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2827             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2828             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2829             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2830             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2831             left[base_y_c[15]]);
2832 
2833         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2834 
2835         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
2836         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
2837         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
2838 
2839         b = _mm256_mullo_epi16(diff, shifty);
2840         res = _mm256_add_epi16(a32, b);
2841         resy = _mm256_srli_epi16(res, 5);
2842       }
2843 
2844       resxy = _mm256_blendv_epi8(resx, resy,
2845                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
2846       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2847     }  // for j
2848     dst += stride;
2849   }
2850 }
2851 
2852 // Directional prediction, zone 2: 90 < angle < 180
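// Added note: for zone 2 a predicted pixel may come from either the above row
// or the left column, so the kernels compute both candidates per lane and then
// select with HighbdBaseMask[base_min_diff]. Conceptually, with hypothetical
// helpers from_above()/from_left() standing in for the two-tap blends:
//
//   use_left = ((((c << 6) - (r + 1) * dx) >> 6) < min_base_x);
//   dst[r * stride + c] = use_left ? from_left(r, c) : from_above(r, c);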
2853 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2854                                       int bh, const uint16_t *above,
2855                                       const uint16_t *left, int upsample_above,
2856                                       int upsample_left, int dx, int dy,
2857                                       int bd) {
2858   (void)bd;
2859   assert(dx > 0);
2860   assert(dy > 0);
2861   switch (bw) {
2862     case 4:
2863       if (bd < 12) {
2864         highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2865                                          upsample_above, upsample_left, dx, dy);
2866       } else {
2867         highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2868                                                upsample_above, upsample_left,
2869                                                dx, dy);
2870       }
2871       break;
2872     case 8:
2873       if (bd < 12) {
2874         highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2875                                          upsample_above, upsample_left, dx, dy);
2876       } else {
2877         highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2878                                                upsample_above, upsample_left,
2879                                                dx, dy);
2880       }
2881       break;
2882     default:
2883       if (bd < 12) {
2884         highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2885                                          upsample_above, upsample_left, dx, dy);
2886       } else {
2887         highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2888                                                upsample_above, upsample_left,
2889                                                dx, dy);
2890       }
2891       break;
2892   }
2893 }
2894 
2895 //  Directional prediction, zone 3 functions
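// Added note: zone 3 (180 < angle < 270) reads only the left edge. Each WxH
// kernel below reuses the zone-1 predictor with width and height swapped and
// `left` as its reference row, then transposes the result into dst, i.e.
// roughly z3_WxH(dst, left) == transpose(z1_HxW(left)).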
2896 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2897                                              const uint16_t *left,
2898                                              int upsample_left, int dy,
2899                                              int bd) {
2900   __m128i dstvec[4], d[4];
2901   if (bd < 12) {
2902     highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2903                                               dy);
2904   } else {
2905     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2906                                                     upsample_left, dy);
2907   }
2908   highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2909                                    &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2910   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2911   _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2912   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2913   _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2914   return;
2915 }
2916 
2917 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2918                                              const uint16_t *left,
2919                                              int upsample_left, int dy,
2920                                              int bd) {
2921   __m128i dstvec[8], d[8];
2922   if (bd < 12) {
2923     highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2924                                               dy);
2925   } else {
2926     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2927                                                     upsample_left, dy);
2928   }
2929   highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2930                            &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2931                            &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2932                            &d[7]);
2933   for (int i = 0; i < 8; i++) {
2934     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2935   }
2936 }
2937 
2938 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2939                                              const uint16_t *left,
2940                                              int upsample_left, int dy,
2941                                              int bd) {
2942   __m128i dstvec[4], d[8];
2943   if (bd < 12) {
2944     highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2945                                               dy);
2946   } else {
2947     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2948                                                     upsample_left, dy);
2949   }
2950 
2951   highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2952                                &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2953                                &d[7]);
2954   for (int i = 0; i < 8; i++) {
2955     _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2956   }
2957 }
2958 
2959 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2960                                              const uint16_t *left,
2961                                              int upsample_left, int dy,
2962                                              int bd) {
2963   __m128i dstvec[8], d[4];
2964   if (bd < 12) {
2965     highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2966                                               dy);
2967   } else {
2968     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2969                                                     upsample_left, dy);
2970   }
2971 
2972   highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2973                                &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2974                                &d[0], &d[1], &d[2], &d[3]);
2975   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2976   _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2977   _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2978   _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2979 }
2980 
2981 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2982                                               const uint16_t *left,
2983                                               int upsample_left, int dy,
2984                                               int bd) {
2985   __m256i dstvec[8], d[8];
2986   if (bd < 12) {
2987     highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
2988                                                dy);
2989   } else {
2990     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
2991                                                      upsample_left, dy);
2992   }
2993   highbd_transpose8x16_16x8_avx2(dstvec, d);
2994   for (int i = 0; i < 8; i++) {
2995     _mm_storeu_si128((__m128i *)(dst + i * stride),
2996                      _mm256_castsi256_si128(d[i]));
2997   }
2998   for (int i = 8; i < 16; i++) {
2999     _mm_storeu_si128((__m128i *)(dst + i * stride),
3000                      _mm256_extracti128_si256(d[i - 8], 1));
3001   }
3002 }
3003 
3004 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3005                                               const uint16_t *left,
3006                                               int upsample_left, int dy,
3007                                               int bd) {
3008   __m128i dstvec[16], d[16];
3009   if (bd < 12) {
3010     highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3011                                               dy);
3012   } else {
3013     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3014                                                     upsample_left, dy);
3015   }
3016   for (int i = 0; i < 16; i += 8) {
3017     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3018                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3019                              &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3020                              &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3021                              &d[5 + i], &d[6 + i], &d[7 + i]);
3022   }
3023   for (int i = 0; i < 8; i++) {
3024     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3025     _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3026   }
3027 }
3028 
3029 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3030                                               const uint16_t *left,
3031                                               int upsample_left, int dy,
3032                                               int bd) {
3033   __m256i dstvec[4], d[4], d1;
3034   if (bd < 12) {
3035     highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3036                                                dy);
3037   } else {
3038     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3039                                                      upsample_left, dy);
3040   }
3041   highbd_transpose4x16_avx2(dstvec, d);
3042   for (int i = 0; i < 4; i++) {
3043     _mm_storel_epi64((__m128i *)(dst + i * stride),
3044                      _mm256_castsi256_si128(d[i]));
3045     d1 = _mm256_bsrli_epi128(d[i], 8);
3046     _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3047                      _mm256_castsi256_si128(d1));
3048     _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3049                      _mm256_extracti128_si256(d[i], 1));
3050     _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3051                      _mm256_extracti128_si256(d1, 1));
3052   }
3053 }
3054 
3055 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3056                                               const uint16_t *left,
3057                                               int upsample_left, int dy,
3058                                               int bd) {
3059   __m128i dstvec[16], d[8];
3060   if (bd < 12) {
3061     highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3062                                               dy);
3063   } else {
3064     highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3065                                                     upsample_left, dy);
3066   }
3067   highbd_transpose16x4_8x8_sse2(dstvec, d);
3068 
3069   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3070   _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3071   _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3072   _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3073   _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3074   _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3075   _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3076   _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3077 }
3078 
3079 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3080                                               const uint16_t *left,
3081                                               int upsample_left, int dy,
3082                                               int bd) {
3083   __m256i dstvec[16], d[16];
3084   if (bd < 12) {
3085     highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3086                                                dy);
3087   } else {
3088     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3089                                                      upsample_left, dy);
3090   }
3091 
3092   for (int i = 0; i < 16; i += 8) {
3093     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3094   }
3095 
3096   for (int i = 0; i < 8; i++) {
3097     _mm_storeu_si128((__m128i *)(dst + i * stride),
3098                      _mm256_castsi256_si128(d[i]));
3099   }
3100   for (int i = 0; i < 8; i++) {
3101     _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3102                      _mm256_extracti128_si256(d[i], 1));
3103   }
3104   for (int i = 8; i < 16; i++) {
3105     _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3106                      _mm256_castsi256_si128(d[i]));
3107   }
3108   for (int i = 8; i < 16; i++) {
3109     _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3110                      _mm256_extracti128_si256(d[i], 1));
3111   }
3112 }
3113 
3114 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3115                                               const uint16_t *left,
3116                                               int upsample_left, int dy,
3117                                               int bd) {
3118   __m128i dstvec[32], d[32];
3119   if (bd < 12) {
3120     highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3121                                               dy);
3122   } else {
3123     highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3124                                                     upsample_left, dy);
3125   }
3126 
3127   for (int i = 0; i < 32; i += 8) {
3128     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3129                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3130                              &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3131                              &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3132                              &d[5 + i], &d[6 + i], &d[7 + i]);
3133   }
3134   for (int i = 0; i < 8; i++) {
3135     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3136     _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3137     _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3138     _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3139   }
3140 }
3141 
3142 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3143                                                const uint16_t *left,
3144                                                int upsample_left, int dy,
3145                                                int bd) {
3146   __m256i dstvec[16], d[16];
3147   if (bd < 12) {
3148     highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3149                                                dy);
3150   } else {
3151     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3152                                                      upsample_left, dy);
3153   }
3154 
3155   highbd_transpose16x16_avx2(dstvec, d);
3156 
3157   for (int i = 0; i < 16; i++) {
3158     _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3159   }
3160 }
3161 
3162 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3163                                                const uint16_t *left,
3164                                                int upsample_left, int dy,
3165                                                int bd) {
3166   __m256i dstvec[64], d[16];
3167   if (bd < 12) {
3168     highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3169                                                dy);
3170   } else {
3171     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3172                                                      upsample_left, dy);
3173   }
3174   highbd_transpose16x16_avx2(dstvec, d);
3175   for (int j = 0; j < 16; j++) {
3176     _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3177   }
3178   highbd_transpose16x16_avx2(dstvec + 16, d);
3179   for (int j = 0; j < 16; j++) {
3180     _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3181   }
3182   highbd_transpose16x16_avx2(dstvec + 32, d);
3183   for (int j = 0; j < 16; j++) {
3184     _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3185   }
3186   highbd_transpose16x16_avx2(dstvec + 48, d);
3187   for (int j = 0; j < 16; j++) {
3188     _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3189   }
3190 }
3191 
3192 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3193                                                const uint16_t *left,
3194                                                int upsample_left, int dy,
3195                                                int bd) {
3196   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3197   if (bd < 12) {
3198     highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3199   } else {
3200     highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3201                                             dy);
3202   }
3203   highbd_transpose(dstT, 64, dst, stride, 64, 64);
3204 }
3205 
3206 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3207                                                const uint16_t *left,
3208                                                int upsample_left, int dy,
3209                                                int bd) {
3210   __m256i dstvec[32], d[32];
3211   if (bd < 12) {
3212     highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3213                                                dy);
3214   } else {
3215     highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3216                                                      upsample_left, dy);
3217   }
3218   for (int i = 0; i < 32; i += 8) {
3219     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3220   }
3221   // store
3222   for (int j = 0; j < 32; j += 16) {
3223     for (int i = 0; i < 8; i++) {
3224       _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3225                        _mm256_castsi256_si128(d[(i + j)]));
3226     }
3227     for (int i = 0; i < 8; i++) {
3228       _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3229                        _mm256_castsi256_si128(d[(i + j) + 8]));
3230     }
3231     for (int i = 8; i < 16; i++) {
3232       _mm256_storeu_si256(
3233           (__m256i *)(dst + (i + j) * stride),
3234           _mm256_inserti128_si256(
3235               d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3236     }
3237   }
3238 }
3239 
3240 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3241                                                const uint16_t *left,
3242                                                int upsample_left, int dy,
3243                                                int bd) {
3244   __m256i dstvec[32], d[16];
3245   if (bd < 12) {
3246     highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3247                                                dy);
3248   } else {
3249     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3250                                                      upsample_left, dy);
3251   }
3252   for (int i = 0; i < 32; i += 16) {
3253     highbd_transpose16x16_avx2((dstvec + i), d);
3254     for (int j = 0; j < 16; j++) {
3255       _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3256     }
3257   }
3258 }
3259 
3260 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3261                                                const uint16_t *left,
3262                                                int upsample_left, int dy,
3263                                                int bd) {
3264   uint16_t dstT[64 * 32];
3265   if (bd < 12) {
3266     highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3267   } else {
3268     highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3269                                             dy);
3270   }
3271   highbd_transpose(dstT, 64, dst, stride, 32, 64);
3272 }
3273 
3274 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3275                                                const uint16_t *left,
3276                                                int upsample_left, int dy,
3277                                                int bd) {
3278   DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3279   highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3280   highbd_transpose(dstT, 32, dst, stride, 64, 32);
3281   return;
3282 }
3283 
3284 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3285                                                const uint16_t *left,
3286                                                int upsample_left, int dy,
3287                                                int bd) {
3288   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3289   if (bd < 12) {
3290     highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3291   } else {
3292     highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3293                                             dy);
3294   }
3295   highbd_transpose(dstT, 64, dst, stride, 16, 64);
3296 }
3297 
3298 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3299                                                const uint16_t *left,
3300                                                int upsample_left, int dy,
3301                                                int bd) {
3302   __m256i dstvec[64], d[16];
3303   if (bd < 12) {
3304     highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3305                                                dy);
3306   } else {
3307     highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3308                                                      upsample_left, dy);
3309   }
3310   for (int i = 0; i < 64; i += 16) {
3311     highbd_transpose16x16_avx2((dstvec + i), d);
3312     for (int j = 0; j < 16; j++) {
3313       _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3314     }
3315   }
3316 }
3317 
3318 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3319                                       int bh, const uint16_t *above,
3320                                       const uint16_t *left, int upsample_left,
3321                                       int dx, int dy, int bd) {
3322   (void)above;
3323   (void)dx;
3324 
3325   assert(dx == 1);
3326   assert(dy > 0);
3327   if (bw == bh) {
3328     switch (bw) {
3329       case 4:
3330         highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3331                                          bd);
3332         break;
3333       case 8:
3334         highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3335                                          bd);
3336         break;
3337       case 16:
3338         highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3339                                            bd);
3340         break;
3341       case 32:
3342         highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3343                                            bd);
3344         break;
3345       case 64:
3346         highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3347                                            bd);
3348         break;
3349     }
3350   } else {
3351     if (bw < bh) {
3352       if (bw + bw == bh) {
3353         switch (bw) {
3354           case 4:
3355             highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3356                                              dy, bd);
3357             break;
3358           case 8:
3359             highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3360                                               dy, bd);
3361             break;
3362           case 16:
3363             highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3364                                                dy, bd);
3365             break;
3366           case 32:
3367             highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3368                                                dy, bd);
3369             break;
3370         }
3371       } else {
3372         switch (bw) {
3373           case 4:
3374             highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3375                                               dy, bd);
3376             break;
3377           case 8:
3378             highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3379                                               dy, bd);
3380             break;
3381           case 16:
3382             highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3383                                                dy, bd);
3384             break;
3385         }
3386       }
3387     } else {
3388       if (bh + bh == bw) {
3389         switch (bh) {
3390           case 4:
3391             highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3392                                              dy, bd);
3393             break;
3394           case 8:
3395             highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3396                                               dy, bd);
3397             break;
3398           case 16:
3399             highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3400                                                dy, bd);
3401             break;
3402           case 32:
3403             highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3404                                                dy, bd);
3405             break;
3406         }
3407       } else {
3408         switch (bh) {
3409           case 4:
3410             highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3411                                               dy, bd);
3412             break;
3413           case 8:
3414             highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3415                                               dy, bd);
3416             break;
3417           case 16:
3418             highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3419                                                dy, bd);
3420             break;
3421         }
3422       }
3423     }
3424   }
3425   return;
3426 }
3427 
3428 // Low bit depth functions
3429 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3430   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3431     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3432   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3433     0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3434   { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3435     0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3436   { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3437     0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3438   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3439     0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3440   { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3441     0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3442   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3443     0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3444   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3445     0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
3446   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3447     0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
3448   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3449     0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
3450   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3451     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3452     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3453   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3454     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3455     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3456   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3457     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
3458     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3459   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3460     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
3461     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3462   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3463     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
3464     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3465   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3466     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
3467     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3468   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3469     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
3470     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3471   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3472     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
3473     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3474   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3475     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
3476     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3477   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3478     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
3479     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3480   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3481     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
3482     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3483   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3484     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3485     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3486   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3487     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3488     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
3489   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3490     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3491     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
3492   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3493     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3494     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
3495   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3496     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3497     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
3498   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3499     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3500     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
3501   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3502     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3503     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
3504   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3505     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3506     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
3507   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3508     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3509     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
3510   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3511     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3512     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
3513   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3515     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3516   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3518     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3519 };
3520 
3521 /* clang-format on */
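// Added note: BaseMask[k] holds exactly k leading 0xff bytes. The zone-1
// kernels below use it with _mm_blendv_epi8 / _mm256_blendv_epi8 to keep the
// first k interpolated pixels of a row and to fill the remainder with the
// replicated last reference sample above[max_base_x].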
3522 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3523     int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3524     int dx) {
3525   const int frac_bits = 6 - upsample_above;
3526   const int max_base_x = ((W + H) - 1) << upsample_above;
3527 
3528   assert(dx > 0);
3529   // pre-filter above pixels
3530   // store in temp buffers:
3531   //   above[x] * 32 + 16
3532   //   above[x+1] - above[x]
3533   // final pixels will be calculated as:
3534   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
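  // Added scalar sketch of one output row r (mirrors the generic zone-1 C
  // reference; illustrative only, not compiled). `row` stands for the 4/8/16
  // pixels packed into dst[r]:
  //
  //   int x     = (r + 1) * dx;
  //   int base  = x >> frac_bits;                       // frac_bits = 6 - upsample_above
  //   int shift = ((x << upsample_above) & 0x3f) >> 1;  // 0..31
  //   for (int c = 0; c < H; ++c, base += (1 << upsample_above)) {
  //     row[c] = (base < max_base_x)
  //                  ? (above[base] * 32 + 16 +
  //                     (above[base + 1] - above[base]) * shift) >> 5
  //                  : above[max_base_x];
  //   }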
3535   __m256i a0, a1, a32, a16;
3536   __m256i diff, c3f;
3537   __m128i a_mbase_x;
3538 
3539   a16 = _mm256_set1_epi16(16);
3540   a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3541   c3f = _mm256_set1_epi16(0x3f);
3542 
3543   int x = dx;
3544   for (int r = 0; r < W; r++) {
3545     __m256i b, res, shift;
3546     __m128i res1, a0_128, a1_128;
3547 
3548     int base = x >> frac_bits;
3549     int base_max_diff = (max_base_x - base) >> upsample_above;
3550     if (base_max_diff <= 0) {
3551       for (int i = r; i < W; ++i) {
3552         dst[i] = a_mbase_x;  // fill the remaining rows with the max-base sample
3553       }
3554       return;
3555     }
3556     if (base_max_diff > H) base_max_diff = H;
3557     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3558     a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3559 
3560     if (upsample_above) {
3561       a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3562       a1_128 = _mm_srli_si128(a0_128, 8);
3563 
3564       shift = _mm256_srli_epi16(
3565           _mm256_and_si256(
3566               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3567           1);
3568     } else {
3569       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3570     }
3571     a0 = _mm256_cvtepu8_epi16(a0_128);
3572     a1 = _mm256_cvtepu8_epi16(a1_128);
3573 
3574     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3575     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3576     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3577 
3578     b = _mm256_mullo_epi16(diff, shift);
3579     res = _mm256_add_epi16(a32, b);
3580     res = _mm256_srli_epi16(res, 5);
3581 
3582     res = _mm256_packus_epi16(
3583         res, _mm256_castsi128_si256(
3584                  _mm256_extracti128_si256(res, 1)));  // narrow to 8 bit
3585     res1 = _mm256_castsi256_si128(res);               // 16 8bit values
3586 
3587     dst[r] =
3588         _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3589     x += dx;
3590   }
3591 }
3592 
3593 static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3594                                       const uint8_t *above, int upsample_above,
3595                                       int dx) {
3596   __m128i dstvec[16];
3597 
3598   dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3599   for (int i = 0; i < N; i++) {
3600     *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3601   }
3602 }
3603 
3604 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3605                                       const uint8_t *above, int upsample_above,
3606                                       int dx) {
3607   __m128i dstvec[32];
3608 
3609   dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3610   for (int i = 0; i < N; i++) {
3611     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3612   }
3613 }
3614 
3615 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3616                                        const uint8_t *above, int upsample_above,
3617                                        int dx) {
3618   __m128i dstvec[64];
3619 
3620   dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3621   for (int i = 0; i < N; i++) {
3622     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3623   }
3624 }
3625 
3626 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3627     int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3628   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3629   (void)upsample_above;
3630   const int frac_bits = 6;
3631   const int max_base_x = ((32 + N) - 1);
3632 
3633   // pre-filter above pixels
3634   // store in temp buffers:
3635   //   above[x] * 32 + 16
3636   //   above[x+1] - above[x]
3637   // final pixels will be calculated as:
3638   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
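  // Added note: each 32-pixel row is produced in two 16-pixel halves
  // (j = 0 and j = 16). Every half widens 16 bytes of `above` to 16-bit lanes,
  // applies the blend described above, and packs back to bytes; the halves are
  // then merged with _mm256_inserti128_si256 and masked against the replicated
  // a_mbase_x via BaseMask[base_max_diff].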
3639   __m256i a0, a1, a32, a16;
3640   __m256i a_mbase_x, diff, c3f;
3641 
3642   a16 = _mm256_set1_epi16(16);
3643   a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3644   c3f = _mm256_set1_epi16(0x3f);
3645 
3646   int x = dx;
3647   for (int r = 0; r < N; r++) {
3648     __m256i b, res, res16[2];
3649     __m128i a0_128, a1_128;
3650 
3651     int base = x >> frac_bits;
3652     int base_max_diff = (max_base_x - base);
3653     if (base_max_diff <= 0) {
3654       for (int i = r; i < N; ++i) {
3655         dstvec[i] = a_mbase_x;  // save 32 values
3656       }
3657       return;
3658     }
3659     if (base_max_diff > 32) base_max_diff = 32;
3660     __m256i shift =
3661         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3662 
3663     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3664       int mdiff = base_max_diff - j;
3665       if (mdiff <= 0) {
3666         res16[jj] = a_mbase_x;
3667       } else {
3668         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3669         a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3670         a0 = _mm256_cvtepu8_epi16(a0_128);
3671         a1 = _mm256_cvtepu8_epi16(a1_128);
3672 
3673         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3674         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3675         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3676         b = _mm256_mullo_epi16(diff, shift);
3677 
3678         res = _mm256_add_epi16(a32, b);
3679         res = _mm256_srli_epi16(res, 5);
3680         res16[jj] = _mm256_packus_epi16(
3681             res, _mm256_castsi128_si256(
3682                      _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3683       }
3684     }
3685     res16[1] =
3686         _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3687                                 1);  // 32 8bit values
3688 
3689     dstvec[r] = _mm256_blendv_epi8(
3690         a_mbase_x, res16[1],
3691         *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
3692     x += dx;
3693   }
3694 }
3695 
3696 static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3697                                        const uint8_t *above, int upsample_above,
3698                                        int dx) {
3699   __m256i dstvec[64];
3700   dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3701   for (int i = 0; i < N; i++) {
3702     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3703   }
3704 }
3705 
3706 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3707                                        const uint8_t *above, int upsample_above,
3708                                        int dx) {
3709   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3710   (void)upsample_above;
3711   const int frac_bits = 6;
3712   const int max_base_x = ((64 + N) - 1);
3713 
3714   // pre-filter above pixels
3715   // store in temp buffers:
3716   //   above[x] * 32 + 16
3717   //   above[x+1] - above[x]
3718   // final pixels will be calculated as:
3719   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3720   __m256i a0, a1, a32, a16;
3721   __m256i a_mbase_x, diff, c3f;
3722   __m128i max_base_x128, base_inc128, mask128;
3723 
3724   a16 = _mm256_set1_epi16(16);
3725   a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3726   max_base_x128 = _mm_set1_epi8(max_base_x);
3727   c3f = _mm256_set1_epi16(0x3f);
3728 
3729   int x = dx;
3730   for (int r = 0; r < N; r++, dst += stride) {
3731     __m256i b, res;
3732     int base = x >> frac_bits;
3733     if (base >= max_base_x) {
3734       for (int i = r; i < N; ++i) {
3735         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
3736         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3737         dst += stride;
3738       }
3739       return;
3740     }
3741 
3742     __m256i shift =
3743         _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3744 
3745     __m128i a0_128, a1_128, res128;
3746     for (int j = 0; j < 64; j += 16) {
3747       int mdif = max_base_x - (base + j);
3748       if (mdif <= 0) {
3749         _mm_storeu_si128((__m128i *)(dst + j),
3750                          _mm256_castsi256_si128(a_mbase_x));
3751       } else {
3752         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3753         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3754         a0 = _mm256_cvtepu8_epi16(a0_128);
3755         a1 = _mm256_cvtepu8_epi16(a1_128);
3756 
3757         diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
3758         a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
3759         a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
3760         b = _mm256_mullo_epi16(diff, shift);
3761 
3762         res = _mm256_add_epi16(a32, b);
3763         res = _mm256_srli_epi16(res, 5);
3764         res = _mm256_packus_epi16(
3765             res, _mm256_castsi128_si256(
3766                      _mm256_extracti128_si256(res, 1)));  // 16 8bit values
3767 
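        // Lanes whose source index reaches max_base_x must output the last
        // edge sample above[max_base_x]; base_inc128/mask128 below build a
        // per-lane mask so that those lanes take a_mbase_x in the blend.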
3768         base_inc128 =
3769             _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3770                           (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3771                           (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3772                           (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3773                           (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3774                           (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3775                           (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3776                           (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3777 
3778         mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3779                                  _mm_setzero_si128());
3780         res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3781                                  _mm256_castsi256_si128(res), mask128);
3782         _mm_storeu_si128((__m128i *)(dst + j), res128);
3783       }
3784     }
3785     x += dx;
3786   }
3787 }
3788 
3789 // Directional prediction, zone 1: 0 < angle < 90
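// Zone 1 reads only the above row: ignoring the upsample_above edge case,
// row r samples around column ((r + 1) * dx) >> 6 and interpolates its two
// neighbours with weight (((r + 1) * dx) & 0x3f) >> 1, as in the scalar
// sketch above; positions past max_base_x repeat above[max_base_x].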
3790 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3791                                const uint8_t *above, const uint8_t *left,
3792                                int upsample_above, int dx, int dy) {
3793   (void)left;
3794   (void)dy;
3795   switch (bw) {
3796     case 4:
3797       dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3798       break;
3799     case 8:
3800       dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3801       break;
3802     case 16:
3803       dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3804       break;
3805     case 32:
3806       dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3807       break;
3808     case 64:
3809       dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3810       break;
3811     default: break;
3812   }
3813   return;
3814 }
3815 
3816 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3817                                       const uint8_t *above, const uint8_t *left,
3818                                       int upsample_above, int upsample_left,
3819                                       int dx, int dy) {
3820   const int min_base_x = -(1 << upsample_above);
3821   const int min_base_y = -(1 << upsample_left);
3822   const int frac_bits_x = 6 - upsample_above;
3823   const int frac_bits_y = 6 - upsample_left;
3824 
3825   assert(dx > 0);
3826   // pre-filter above pixels
3827   // store in temp buffers:
3828   //   above[x] * 32 + 16
3829   //   above[x+1] - above[x]
3830   // final pixels will be calculated as:
3831   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3832   __m128i a0_x, a1_x, a32, a16, diff;
3833   __m128i c3f, min_base_y128, c1234, dy128;
3834 
3835   a16 = _mm_set1_epi16(16);
3836   c3f = _mm_set1_epi16(0x3f);
3837   min_base_y128 = _mm_set1_epi16(min_base_y);
3838   c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3839   dy128 = _mm_set1_epi16(dy);
3840 
3841   for (int r = 0; r < N; r++) {
3842     __m128i b, res, shift, r6, ydx;
3843     __m128i resx, resy, resxy;
3844     __m128i a0_x128, a1_x128;
3845     int y = r + 1;
3846     int base_x = (-y * dx) >> frac_bits_x;
3847     int base_shift = 0;
3848     if (base_x < (min_base_x - 1)) {
3849       base_shift = (min_base_x - base_x - 1) >> upsample_above;
3850     }
3851     int base_min_diff =
3852         (min_base_x - base_x + upsample_above) >> upsample_above;
3853     if (base_min_diff > 4) {
3854       base_min_diff = 4;
3855     } else {
3856       if (base_min_diff < 0) base_min_diff = 0;
3857     }
3858 
3859     if (base_shift > 3) {
3860       a0_x = _mm_setzero_si128();
3861       a1_x = _mm_setzero_si128();
3862       shift = _mm_setzero_si128();
3863     } else {
3864       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3865       ydx = _mm_set1_epi16(y * dx);
3866       r6 = _mm_slli_epi16(c1234, 6);
3867 
3868       if (upsample_above) {
3869         a0_x128 =
3870             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3871         a1_x128 = _mm_srli_si128(a0_x128, 8);
3872 
3873         shift = _mm_srli_epi16(
3874             _mm_and_si128(
3875                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3876             1);
3877       } else {
3878         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3879         a1_x128 = _mm_srli_si128(a0_x128, 1);
3880 
3881         shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3882       }
3883       a0_x = _mm_cvtepu8_epi16(a0_x128);
3884       a1_x = _mm_cvtepu8_epi16(a1_x128);
3885     }
3886     // y calc
3887     __m128i a0_y, a1_y, shifty;
3888     if (base_x < min_base_x) {
3889       DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3890       __m128i y_c128, base_y_c128, mask128, c1234_;
3891       c1234_ = _mm_srli_si128(c1234, 2);
3892       r6 = _mm_set1_epi16(r << 6);
3893       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3894       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3895       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3896       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3897       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3898 
3899       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3900                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3901       base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3902       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3903       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3904                             left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3905 
3906       if (upsample_left) {
3907         shifty = _mm_srli_epi16(
3908             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3909       } else {
3910         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3911       }
3912       a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3913       a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3914       shift = _mm_unpacklo_epi64(shift, shifty);
3915     }
3916 
3917     diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
3918     a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
3919     a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
3920 
3921     b = _mm_mullo_epi16(diff, shift);
3922     res = _mm_add_epi16(a32, b);
3923     res = _mm_srli_epi16(res, 5);
3924 
3925     resx = _mm_packus_epi16(res, res);
3926     resy = _mm_srli_si128(resx, 4);
3927 
3928     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3929     *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3930     dst += stride;
3931   }
3932 }
3933 
3934 static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3935                                       const uint8_t *above, const uint8_t *left,
3936                                       int upsample_above, int upsample_left,
3937                                       int dx, int dy) {
3938   const int min_base_x = -(1 << upsample_above);
3939   const int min_base_y = -(1 << upsample_left);
3940   const int frac_bits_x = 6 - upsample_above;
3941   const int frac_bits_y = 6 - upsample_left;
3942 
3943   // pre-filter above pixels
3944   // store in temp buffers:
3945   //   above[x] * 32 + 16
3946   //   above[x+1] - above[x]
3947   // final pixels will be calculated as:
3948   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3949   __m256i diff, a32, a16;
3950   __m256i a0_x, a1_x;
3951   __m128i a0_x128, a1_x128, min_base_y128, c3f;
3952   __m128i c1234, dy128;
3953 
3954   a16 = _mm256_set1_epi16(16);
3955   c3f = _mm_set1_epi16(0x3f);
3956   min_base_y128 = _mm_set1_epi16(min_base_y);
3957   dy128 = _mm_set1_epi16(dy);
3958   c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3959 
3960   for (int r = 0; r < N; r++) {
3961     __m256i b, res, shift;
3962     __m128i resx, resy, resxy, r6, ydx;
3963 
3964     int y = r + 1;
3965     int base_x = (-y * dx) >> frac_bits_x;
3966     int base_shift = 0;
3967     if (base_x < (min_base_x - 1)) {
3968       base_shift = (min_base_x - base_x - 1) >> upsample_above;
3969     }
3970     int base_min_diff =
3971         (min_base_x - base_x + upsample_above) >> upsample_above;
3972     if (base_min_diff > 8) {
3973       base_min_diff = 8;
3974     } else {
3975       if (base_min_diff < 0) base_min_diff = 0;
3976     }
3977 
3978     if (base_shift > 7) {
3979       a0_x = _mm256_setzero_si256();
3980       a1_x = _mm256_setzero_si256();
3981       shift = _mm256_setzero_si256();
3982     } else {
3983       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3984       ydx = _mm_set1_epi16(y * dx);
3985       r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
3986       if (upsample_above) {
3987         a0_x128 =
3988             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3989         a1_x128 = _mm_srli_si128(a0_x128, 8);
3990 
3991         shift = _mm256_castsi128_si256(_mm_srli_epi16(
3992             _mm_and_si128(
3993                 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3994             1));
3995       } else {
3996         a1_x128 = _mm_srli_si128(a0_x128, 1);
3997         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3998         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
3999 
4000         shift = _mm256_castsi128_si256(
4001             _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4002       }
4003       a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4004       a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4005     }
4006 
4007     // y calc
4008     __m128i a0_y, a1_y, shifty;
4009     if (base_x < min_base_x) {
4010       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4011       __m128i y_c128, base_y_c128, mask128;
4012       r6 = _mm_set1_epi16(r << 6);
4013       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4014       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4015       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4016       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4017       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4018 
4019       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4020                             left[base_y_c[2]], left[base_y_c[3]],
4021                             left[base_y_c[4]], left[base_y_c[5]],
4022                             left[base_y_c[6]], left[base_y_c[7]]);
4023       base_y_c128 = _mm_add_epi16(
4024           base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4025       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4026 
4027       a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4028                             left[base_y_c[2]], left[base_y_c[3]],
4029                             left[base_y_c[4]], left[base_y_c[5]],
4030                             left[base_y_c[6]], left[base_y_c[7]]);
4031 
4032       if (upsample_left) {
4033         shifty = _mm_srli_epi16(
4034             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4035       } else {
4036         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4037       }
4038 
4039       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4040       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4041       shift = _mm256_inserti128_si256(shift, shifty, 1);
4042     }
4043 
4044     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4045     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4046     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4047 
4048     b = _mm256_mullo_epi16(diff, shift);
4049     res = _mm256_add_epi16(a32, b);
4050     res = _mm256_srli_epi16(res, 5);
4051 
4052     resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4053                             _mm256_castsi256_si128(res));
4054     resy = _mm256_extracti128_si256(res, 1);
4055     resy = _mm_packus_epi16(resy, resy);
4056 
4057     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4058     _mm_storel_epi64((__m128i *)(dst), resxy);
4059     dst += stride;
4060   }
4061 }
4062 
4063 static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4064                                       ptrdiff_t stride, const uint8_t *above,
4065                                       const uint8_t *left, int upsample_above,
4066                                       int upsample_left, int dx, int dy) {
4067   // here upsample_above and upsample_left are 0 by design of
4068   // av1_use_intra_edge_upsample
4069   const int min_base_x = -1;
4070   const int min_base_y = -1;
4071   (void)upsample_above;
4072   (void)upsample_left;
4073   const int frac_bits_x = 6;
4074   const int frac_bits_y = 6;
4075 
4076   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4077   __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4078   __m128i a0_x128, a1_x128;
4079 
4080   DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4081   a16 = _mm256_set1_epi16(16);
4082   c1 = _mm256_srli_epi16(a16, 4);
4083   min_base_y256 = _mm256_set1_epi16(min_base_y);
4084   c3f = _mm256_set1_epi16(0x3f);
4085   dy256 = _mm256_set1_epi16(dy);
4086   c0123 =
4087       _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4088   c1234 = _mm256_add_epi16(c0123, c1);
4089 
4090   for (int r = 0; r < H; r++) {
4091     __m256i b, res, shift, j256, r6, ydx;
4092     __m128i resx, resy;
4093     __m128i resxy;
4094     int y = r + 1;
4095     ydx = _mm256_set1_epi16((int16_t)(y * dx));
4096 
4097     int base_x = (-y * dx) >> frac_bits_x;
4098     for (int j = 0; j < W; j += 16) {
4099       j256 = _mm256_set1_epi16(j);
4100       int base_shift = 0;
4101       if ((base_x + j) < (min_base_x - 1)) {
4102         base_shift = (min_base_x - (base_x + j) - 1);
4103       }
4104       int base_min_diff = (min_base_x - base_x - j);
4105       if (base_min_diff > 16) {
4106         base_min_diff = 16;
4107       } else {
4108         if (base_min_diff < 0) base_min_diff = 0;
4109       }
4110 
4111       if (base_shift < 16) {
4112         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4113         a1_x128 =
4114             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4115         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4116         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4117 
4118         a0_x = _mm256_cvtepu8_epi16(a0_x128);
4119         a1_x = _mm256_cvtepu8_epi16(a1_x128);
4120 
4121         r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4122         shift = _mm256_srli_epi16(
4123             _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4124 
4125         diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
4126         a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
4127         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4128 
4129         b = _mm256_mullo_epi16(diff, shift);
4130         res = _mm256_add_epi16(a32, b);
4131         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4132         resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4133             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4134       } else {
4135         resx = _mm_setzero_si128();
4136       }
4137 
4138       // y calc
4139       if (base_x < min_base_x) {
4140         __m256i c256, y_c256, base_y_c256, mask256, mul16;
4141         r6 = _mm256_set1_epi16(r << 6);
4142         c256 = _mm256_add_epi16(j256, c1234);
4143         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4144                                  _mm256_srli_epi16(min_base_y256, 1));
4145         y_c256 = _mm256_sub_epi16(r6, mul16);
4146 
4147         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4148         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4149 
4150         base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4151         int16_t min_y = (int16_t)_mm_extract_epi16(
4152             _mm256_extracti128_si256(base_y_c256, 1), 7);
4153         int16_t max_y =
4154             (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4155         int16_t offset_diff = max_y - min_y;
4156 
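        // Fast path: when the 16 per-lane base_y indices span fewer than 16
        // entries of `left`, fetch them with one masked load starting at
        // left[min_y] plus a byte shuffle instead of 16 scalar lookups.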
4157         if (offset_diff < 16) {
4158           __m256i min_y256 = _mm256_set1_epi16(min_y);
4159 
4160           __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4161           __m128i base_y_offset128 =
4162               _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4163                               _mm256_extracti128_si256(base_y_offset, 1));
4164 
4165           __m128i a0_y128 = _mm_maskload_epi32(
4166               (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4167           __m128i a1_y128 =
4168               _mm_maskload_epi32((int *)(left + min_y + 1),
4169                                  *(__m128i *)LoadMaskz2[offset_diff / 4]);
4170           a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4171           a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4172           a0_y = _mm256_cvtepu8_epi16(a0_y128);
4173           a1_y = _mm256_cvtepu8_epi16(a1_y128);
4174         } else {
4175           base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4176           _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4177 
4178           a0_y = _mm256_setr_epi16(
4179               left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4180               left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4181               left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4182               left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4183               left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4184               left[base_y_c[15]]);
4185           base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4186           _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4187 
4188           a1_y = _mm256_setr_epi16(
4189               left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4190               left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4191               left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4192               left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4193               left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4194               left[base_y_c[15]]);
4195         }
4196         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4197 
4198         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
4199         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
4200         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
4201 
4202         b = _mm256_mullo_epi16(diff, shifty);
4203         res = _mm256_add_epi16(a32, b);
4204         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
4205         resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4206             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4207       } else {
4208         resy = _mm_setzero_si128();
4209       }
4210       resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4211       _mm_storeu_si128((__m128i *)(dst + j), resxy);
4212     }  // for j
4213     dst += stride;
4214   }
4215 }
4216 
4217 // Directional prediction, zone 2: 90 < angle < 180
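// Zone 2 mixes both edges. Roughly, the per-pixel rule the kernels above
// vectorize is (illustrative sketch only; upsample handling omitted):
//
//   const int x = (c << 6) - (r + 1) * dx;    // Q6 position along `above`
//   if ((x >> 6) >= -1) {
//     // in range of the above row: interpolate above[x >> 6] and
//     // above[(x >> 6) + 1] as in zone 1
//   } else {
//     const int y = (r << 6) - (c + 1) * dy;  // Q6 position along `left`
//     // interpolate left[y >> 6] and left[(y >> 6) + 1]
//   }
//
// The BaseMask blend of resx/resy selects the left-derived values for the
// first base_min_diff columns of each row.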
4218 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4219                                const uint8_t *above, const uint8_t *left,
4220                                int upsample_above, int upsample_left, int dx,
4221                                int dy) {
4222   assert(dx > 0);
4223   assert(dy > 0);
4224   switch (bw) {
4225     case 4:
4226       dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4227                                 upsample_left, dx, dy);
4228       break;
4229     case 8:
4230       dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4231                                 upsample_left, dx, dy);
4232       break;
4233     default:
4234       dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4235                                 upsample_above, upsample_left, dx, dy);
4236       break;
4237   }
4238   return;
4239 }
4240 
4241 // z3 functions
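// Zone 3 (180 < angle < 270) reads only the left column. The predictors
// below run the zone-1 kernels on `left` (stepping by dy) to build the block
// column-wise, then transpose the result (e.g. with transpose16x32_avx2)
// into the destination.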
4242 static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
4243   __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4244   __m256i w10, w11, w12, w13, w14, w15;
4245 
4246   w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4247   w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4248   w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4249   w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4250 
4251   w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4252   w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4253   w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4254   w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4255 
4256   w4 = _mm256_unpacklo_epi16(w0, w1);
4257   w5 = _mm256_unpacklo_epi16(w2, w3);
4258   w12 = _mm256_unpacklo_epi16(w8, w9);
4259   w13 = _mm256_unpacklo_epi16(w10, w11);
4260 
4261   w6 = _mm256_unpacklo_epi32(w4, w5);
4262   w7 = _mm256_unpackhi_epi32(w4, w5);
4263   w14 = _mm256_unpacklo_epi32(w12, w13);
4264   w15 = _mm256_unpackhi_epi32(w12, w13);
4265 
4266   // Store first 4-line result
4267   d[0] = _mm256_unpacklo_epi64(w6, w14);
4268   d[1] = _mm256_unpackhi_epi64(w6, w14);
4269   d[2] = _mm256_unpacklo_epi64(w7, w15);
4270   d[3] = _mm256_unpackhi_epi64(w7, w15);
4271 
4272   w4 = _mm256_unpackhi_epi16(w0, w1);
4273   w5 = _mm256_unpackhi_epi16(w2, w3);
4274   w12 = _mm256_unpackhi_epi16(w8, w9);
4275   w13 = _mm256_unpackhi_epi16(w10, w11);
4276 
4277   w6 = _mm256_unpacklo_epi32(w4, w5);
4278   w7 = _mm256_unpackhi_epi32(w4, w5);
4279   w14 = _mm256_unpacklo_epi32(w12, w13);
4280   w15 = _mm256_unpackhi_epi32(w12, w13);
4281 
4282   // Store second 4-line result
4283   d[4] = _mm256_unpacklo_epi64(w6, w14);
4284   d[5] = _mm256_unpackhi_epi64(w6, w14);
4285   d[6] = _mm256_unpacklo_epi64(w7, w15);
4286   d[7] = _mm256_unpackhi_epi64(w7, w15);
4287 
4288   // upper half
4289   w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4290   w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4291   w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4292   w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4293 
4294   w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4295   w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4296   w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4297   w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4298 
4299   w4 = _mm256_unpacklo_epi16(w0, w1);
4300   w5 = _mm256_unpacklo_epi16(w2, w3);
4301   w12 = _mm256_unpacklo_epi16(w8, w9);
4302   w13 = _mm256_unpacklo_epi16(w10, w11);
4303 
4304   w6 = _mm256_unpacklo_epi32(w4, w5);
4305   w7 = _mm256_unpackhi_epi32(w4, w5);
4306   w14 = _mm256_unpacklo_epi32(w12, w13);
4307   w15 = _mm256_unpackhi_epi32(w12, w13);
4308 
4309   // Store first 4-line result
4310   d[8] = _mm256_unpacklo_epi64(w6, w14);
4311   d[9] = _mm256_unpackhi_epi64(w6, w14);
4312   d[10] = _mm256_unpacklo_epi64(w7, w15);
4313   d[11] = _mm256_unpackhi_epi64(w7, w15);
4314 
4315   w4 = _mm256_unpackhi_epi16(w0, w1);
4316   w5 = _mm256_unpackhi_epi16(w2, w3);
4317   w12 = _mm256_unpackhi_epi16(w8, w9);
4318   w13 = _mm256_unpackhi_epi16(w10, w11);
4319 
4320   w6 = _mm256_unpacklo_epi32(w4, w5);
4321   w7 = _mm256_unpackhi_epi32(w4, w5);
4322   w14 = _mm256_unpacklo_epi32(w12, w13);
4323   w15 = _mm256_unpackhi_epi32(w12, w13);
4324 
4325   // Store second 4-line result
4326   d[12] = _mm256_unpacklo_epi64(w6, w14);
4327   d[13] = _mm256_unpackhi_epi64(w6, w14);
4328   d[14] = _mm256_unpacklo_epi64(w7, w15);
4329   d[15] = _mm256_unpackhi_epi64(w7, w15);
4330 }
4331 
4332 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4333                                       const uint8_t *left, int upsample_left,
4334                                       int dy) {
4335   __m128i dstvec[4], d[4];
4336 
4337   dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4338   transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4339                             &d[0], &d[1], &d[2], &d[3]);
4340 
4341   *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4342   *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4343   *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4344   *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4345   return;
4346 }
4347 
4348 static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4349                                       const uint8_t *left, int upsample_left,
4350                                       int dy) {
4351   __m128i dstvec[8], d[8];
4352 
4353   dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4354   transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4355                     &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4356                     &d[3]);
4357 
4358   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4359   _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4360   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4361   _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4362   _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4363   _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4364   _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4365   _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4366 }
4367 
4368 static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
4369                                       const uint8_t *left, int upsample_left,
4370                                       int dy) {
4371   __m128i dstvec[4], d[8];
4372 
4373   dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
4374   transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
4375                         &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
4376   for (int i = 0; i < 8; i++) {
4377     *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4378   }
4379 }
4380 
4381 static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
4382                                       const uint8_t *left, int upsample_left,
4383                                       int dy) {
4384   __m128i dstvec[8], d[4];
4385 
4386   dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
4387   transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4388                         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
4389                         &d[1], &d[2], &d[3]);
4390   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4391   _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
4392   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
4393   _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
4394 }
4395 
4396 static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
4397                                        const uint8_t *left, int upsample_left,
4398                                        int dy) {
4399   __m128i dstvec[8], d[8];
4400 
4401   dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
4402   transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
4403                           dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
4404                           d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
4405   for (int i = 0; i < 8; i++) {
4406     _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
4407     _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
4408                      _mm_srli_si128(d[i], 8));
4409   }
4410 }
4411 
4412 static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
4413                                        const uint8_t *left, int upsample_left,
4414                                        int dy) {
4415   __m128i dstvec[16], d[16];
4416 
4417   dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
4418   transpose16x8_8x16_sse2(
4419       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4420       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4421       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4422       &d[3], &d[4], &d[5], &d[6], &d[7]);
4423 
4424   for (int i = 0; i < 8; i++) {
4425     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4426   }
4427 }
4428 
4429 static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
4430                                        const uint8_t *left, int upsample_left,
4431                                        int dy) {
4432   __m128i dstvec[4], d[16];
4433 
4434   dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
4435   transpose4x16_sse2(dstvec, d);
4436   for (int i = 0; i < 16; i++) {
4437     *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
4438   }
4439 }
4440 
4441 static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
4442                                        const uint8_t *left, int upsample_left,
4443                                        int dy) {
4444   __m128i dstvec[16], d[8];
4445 
4446   dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
4447   for (int i = 4; i < 8; i++) {
4448     d[i] = _mm_setzero_si128();
4449   }
4450   transpose16x8_8x16_sse2(
4451       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4452       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4453       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4454       &d[3], &d[4], &d[5], &d[6], &d[7]);
4455 
4456   for (int i = 0; i < 4; i++) {
4457     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4458   }
4459 }
4460 
4461 static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
4462                                        const uint8_t *left, int upsample_left,
4463                                        int dy) {
4464   __m256i dstvec[16], d[16];
4465 
4466   dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
4467   for (int i = 8; i < 16; i++) {
4468     dstvec[i] = _mm256_setzero_si256();
4469   }
4470   transpose16x32_avx2(dstvec, d);
4471 
4472   for (int i = 0; i < 16; i++) {
4473     _mm_storel_epi64((__m128i *)(dst + i * stride),
4474                      _mm256_castsi256_si128(d[i]));
4475   }
4476   for (int i = 0; i < 16; i++) {
4477     _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
4478                      _mm256_extracti128_si256(d[i], 1));
4479   }
4480 }
4481 
4482 static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
4483                                        const uint8_t *left, int upsample_left,
4484                                        int dy) {
4485   __m128i dstvec[32], d[16];
4486 
4487   dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
4488 
4489   transpose16x8_8x16_sse2(
4490       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
4491       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
4492       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
4493       &d[3], &d[4], &d[5], &d[6], &d[7]);
4494   transpose16x8_8x16_sse2(
4495       &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
4496       &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
4497       &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
4498       &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
4499       &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
4500       &d[6 + 8], &d[7 + 8]);
4501 
4502   for (int i = 0; i < 8; i++) {
4503     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4504     _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
4505   }
4506 }
4507 
4508 static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
4509                                         const uint8_t *left, int upsample_left,
4510                                         int dy) {
4511   __m128i dstvec[16], d[16];
4512 
4513   dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
4514   transpose16x16_sse2(dstvec, d);
4515 
4516   for (int i = 0; i < 16; i++) {
4517     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
4518   }
4519 }
4520 
4521 static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
4522                                         const uint8_t *left, int upsample_left,
4523                                         int dy) {
4524   __m256i dstvec[32], d[32];
4525 
4526   dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
4527   transpose16x32_avx2(dstvec, d);
4528   transpose16x32_avx2(dstvec + 16, d + 16);
4529   for (int j = 0; j < 16; j++) {
4530     _mm_storeu_si128((__m128i *)(dst + j * stride),
4531                      _mm256_castsi256_si128(d[j]));
4532     _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
4533                      _mm256_castsi256_si128(d[j + 16]));
4534   }
4535   for (int j = 0; j < 16; j++) {
4536     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4537                      _mm256_extracti128_si256(d[j], 1));
4538     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
4539                      _mm256_extracti128_si256(d[j + 16], 1));
4540   }
4541 }
4542 
4543 static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
4544                                         const uint8_t *left, int upsample_left,
4545                                         int dy) {
4546   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
4547   dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
4548   transpose(dstT, 64, dst, stride, 64, 64);
4549 }
4550 
4551 static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
4552                                         const uint8_t *left, int upsample_left,
4553                                         int dy) {
4554   __m256i dstvec[16], d[16];
4555 
4556   dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
4557   transpose16x32_avx2(dstvec, d);
4558   // store
4559   for (int j = 0; j < 16; j++) {
4560     _mm_storeu_si128((__m128i *)(dst + j * stride),
4561                      _mm256_castsi256_si128(d[j]));
4562     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
4563                      _mm256_extracti128_si256(d[j], 1));
4564   }
4565 }
4566 
4567 static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
4568                                         const uint8_t *left, int upsample_left,
4569                                         int dy) {
4570   __m128i dstvec[32], d[16];
4571 
4572   dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
4573   for (int i = 0; i < 32; i += 16) {
4574     transpose16x16_sse2((dstvec + i), d);
4575     for (int j = 0; j < 16; j++) {
4576       _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4577     }
4578   }
4579 }
4580 
4581 static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
4582                                         const uint8_t *left, int upsample_left,
4583                                         int dy) {
4584   uint8_t dstT[64 * 32];
4585   dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
4586   transpose(dstT, 64, dst, stride, 32, 64);
4587 }
4588 
4589 static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
4590                                         const uint8_t *left, int upsample_left,
4591                                         int dy) {
4592   uint8_t dstT[32 * 64];
4593   dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
4594   transpose(dstT, 32, dst, stride, 64, 32);
4595   return;
4596 }
4597 
4598 static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
4599                                         const uint8_t *left, int upsample_left,
4600                                         int dy) {
4601   uint8_t dstT[64 * 16];
4602   dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
4603   transpose(dstT, 64, dst, stride, 16, 64);
4604 }
4605 
4606 static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
4607                                         const uint8_t *left, int upsample_left,
4608                                         int dy) {
4609   __m128i dstvec[64], d[16];
4610 
4611   dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
4612   for (int i = 0; i < 64; i += 16) {
4613     transpose16x16_sse2((dstvec + i), d);
4614     for (int j = 0; j < 16; j++) {
4615       _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
4616     }
4617   }
4618 }
4619 
4620 void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4621                                const uint8_t *above, const uint8_t *left,
4622                                int upsample_left, int dx, int dy) {
4623   (void)above;
4624   (void)dx;
4625   assert(dx == 1);
4626   assert(dy > 0);
4627 
4628   if (bw == bh) {
4629     switch (bw) {
4630       case 4:
4631         dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
4632         break;
4633       case 8:
4634         dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
4635         break;
4636       case 16:
4637         dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
4638         break;
4639       case 32:
4640         dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
4641         break;
4642       case 64:
4643         dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
4644         break;
4645     }
4646   } else {
4647     if (bw < bh) {
4648       if (bw + bw == bh) {
4649         switch (bw) {
4650           case 4:
4651             dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
4652             break;
4653           case 8:
4654             dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
4655             break;
4656           case 16:
4657             dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
4658             break;
4659           case 32:
4660             dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
4661             break;
4662         }
4663       } else {
4664         switch (bw) {
4665           case 4:
4666             dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
4667             break;
4668           case 8:
4669             dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
4670             break;
4671           case 16:
4672             dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
4673             break;
4674         }
4675       }
4676     } else {
4677       if (bh + bh == bw) {
4678         switch (bh) {
4679           case 4:
4680             dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
4681             break;
4682           case 8:
4683             dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
4684             break;
4685           case 16:
4686             dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
4687             break;
4688           case 32:
4689             dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
4690             break;
4691         }
4692       } else {
4693         switch (bh) {
4694           case 4:
4695             dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
4696             break;
4697           case 8:
4698             dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
4699             break;
4700           case 16:
4701             dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
4702             break;
4703         }
4704       }
4705     }
4706   }
4707 }
4708