1 /*
2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <immintrin.h>
13
14 #include "config/av1_rtcd.h"
15 #include "aom_dsp/x86/intrapred_x86.h"
16 #include "aom_dsp/x86/intrapred_utils.h"
17 #include "aom_dsp/x86/lpf_common_sse2.h"
18
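// dc_sum_64/dc_sum_32: sum 64 (or 32) unsigned bytes. _mm256_sad_epu8 against
// zero yields four 64-bit partial sums per register; these are then folded
// across the two 128-bit lanes so the total ends up in the low 16 bits of
// each lane, ready to be broadcast with a byte shuffle by the DC predictors
// below.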
static INLINE __m256i dc_sum_64(const uint8_t *ref) {
20 const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
21 const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
22 const __m256i zero = _mm256_setzero_si256();
23 __m256i y0 = _mm256_sad_epu8(x0, zero);
24 __m256i y1 = _mm256_sad_epu8(x1, zero);
25 y0 = _mm256_add_epi64(y0, y1);
26 __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
27 y0 = _mm256_add_epi64(u0, y0);
28 u0 = _mm256_unpackhi_epi64(y0, y0);
29 return _mm256_add_epi16(y0, u0);
30 }
31
static INLINE __m256i dc_sum_32(const uint8_t *ref) {
33 const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
34 const __m256i zero = _mm256_setzero_si256();
35 __m256i y = _mm256_sad_epu8(x, zero);
36 __m256i u = _mm256_permute2x128_si256(y, y, 1);
37 y = _mm256_add_epi64(u, y);
38 u = _mm256_unpackhi_epi64(y, y);
39 return _mm256_add_epi16(y, u);
40 }
41
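// Row-store helpers: write one 32-byte register (row_store_32xh), two
// different registers side by side (row_store_32x2xh), or the same register
// twice (row_store_64xh) to `height` consecutive rows of dst.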
static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
43 ptrdiff_t stride) {
44 for (int i = 0; i < height; ++i) {
45 _mm256_storeu_si256((__m256i *)dst, *r);
46 dst += stride;
47 }
48 }
49
static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
51 int height, uint8_t *dst,
52 ptrdiff_t stride) {
53 for (int i = 0; i < height; ++i) {
54 _mm256_storeu_si256((__m256i *)dst, *r0);
55 _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
56 dst += stride;
57 }
58 }
59
static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
61 ptrdiff_t stride) {
62 for (int i = 0; i < height; ++i) {
63 _mm256_storeu_si256((__m256i *)dst, *r);
64 _mm256_storeu_si256((__m256i *)(dst + 32), *r);
65 dst += stride;
66 }
67 }
68
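// Shuffle/mask tables used by the high-bitdepth directional prediction code
// in this file. HighbdLoadMaskx[i] shifts a row of 16-bit pixels right by i
// positions, replicating the first pixel into the vacated slots.
// HighbdEvenOddMaskx4 and HighbdEvenOddMaskx de-interleave even/odd 16-bit
// pixels (used for the upsampled edge). HighbdBaseMask[k] is an all-ones mask
// in the first k 16-bit lanes and zero elsewhere.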
69 static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
70 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
71 { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
72 { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
73 { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
74 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
75 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
76 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
77 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
78 };
79
80 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
81 { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
82 { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
83 { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
84 { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
85 };
86
87 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
88 { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
89 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
90 { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
91 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
92 { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25,
93 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
94 { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
95 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
96 { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21,
97 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
98 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
99 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
100 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
101 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
102 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
103 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
104 };
105
106 static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
107 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
108 { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
109 { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
110 { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
111 { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
112 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
113 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
114 0 },
115 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
116 0, 0 },
117 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
118 0, 0, 0, 0 },
119 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
120 0, 0, 0, 0, 0, 0 },
121 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
122 0xffff, 0, 0, 0, 0, 0, 0 },
123 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
124 0xffff, 0xffff, 0, 0, 0, 0, 0 },
125 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
126 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
127 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
128 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
129 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
130 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
131 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
132 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
133 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
134 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
135 };
136
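// Transposes a 16x4 tile of 16-bit pixels (the low four elements of each of
// the sixteen input registers). Input column j, read down all sixteen rows,
// is returned in d[2 * j] (rows 0-7) and d[2 * j + 1] (rows 8-15).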
static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
138 __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
139
140 r0 = _mm_unpacklo_epi16(x[0], x[1]);
141 r1 = _mm_unpacklo_epi16(x[2], x[3]);
142 r2 = _mm_unpacklo_epi16(x[4], x[5]);
143 r3 = _mm_unpacklo_epi16(x[6], x[7]);
144
145 r4 = _mm_unpacklo_epi16(x[8], x[9]);
146 r5 = _mm_unpacklo_epi16(x[10], x[11]);
147 r6 = _mm_unpacklo_epi16(x[12], x[13]);
148 r7 = _mm_unpacklo_epi16(x[14], x[15]);
149
150 r8 = _mm_unpacklo_epi32(r0, r1);
151 r9 = _mm_unpackhi_epi32(r0, r1);
152 r10 = _mm_unpacklo_epi32(r2, r3);
153 r11 = _mm_unpackhi_epi32(r2, r3);
154
155 r12 = _mm_unpacklo_epi32(r4, r5);
156 r13 = _mm_unpackhi_epi32(r4, r5);
157 r14 = _mm_unpacklo_epi32(r6, r7);
158 r15 = _mm_unpackhi_epi32(r6, r7);
159
160 r0 = _mm_unpacklo_epi64(r8, r9);
161 r1 = _mm_unpackhi_epi64(r8, r9);
162 r2 = _mm_unpacklo_epi64(r10, r11);
163 r3 = _mm_unpackhi_epi64(r10, r11);
164
165 r4 = _mm_unpacklo_epi64(r12, r13);
166 r5 = _mm_unpackhi_epi64(r12, r13);
167 r6 = _mm_unpacklo_epi64(r14, r15);
168 r7 = _mm_unpackhi_epi64(r14, r15);
169
170 d[0] = _mm_unpacklo_epi64(r0, r2);
171 d[1] = _mm_unpacklo_epi64(r4, r6);
172 d[2] = _mm_unpacklo_epi64(r1, r3);
173 d[3] = _mm_unpacklo_epi64(r5, r7);
174
175 d[4] = _mm_unpackhi_epi64(r0, r2);
176 d[5] = _mm_unpackhi_epi64(r4, r6);
177 d[6] = _mm_unpackhi_epi64(r1, r3);
178 d[7] = _mm_unpackhi_epi64(r5, r7);
179 }
180
static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
182 __m256i w0, w1, w2, w3, ww0, ww1;
183
184 w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
185 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
186 w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53
187 w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73
188
189 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
190 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
191
192 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
193 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
194
195 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
196 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
197
198 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
199 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
200 }
201
static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
203 __m256i w0, w1, w2, w3, ww0, ww1;
204
205 w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
206 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
207 w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53
208 w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73
209
210 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
211 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
212
213 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
214 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
215
216 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
217 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
218
219 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
220 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
221
222 w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
223 w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
224 w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57
225 w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77
226
227 ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
228 ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
229
230 d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
231 d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
232
233 ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
234 ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
235
236 d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
237 d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
238 }
239
static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
241 __m256i w0, w1, w2, w3, ww0, ww1;
242 __m256i dd[16];
243 w0 = _mm256_unpacklo_epi16(x[0], x[1]);
244 w1 = _mm256_unpacklo_epi16(x[2], x[3]);
245 w2 = _mm256_unpacklo_epi16(x[4], x[5]);
246 w3 = _mm256_unpacklo_epi16(x[6], x[7]);
247
248 ww0 = _mm256_unpacklo_epi32(w0, w1); //
249 ww1 = _mm256_unpacklo_epi32(w2, w3); //
250
251 dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
252 dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
253
254 ww0 = _mm256_unpackhi_epi32(w0, w1); //
255 ww1 = _mm256_unpackhi_epi32(w2, w3); //
256
257 dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
258 dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
259
260 w0 = _mm256_unpackhi_epi16(x[0], x[1]);
261 w1 = _mm256_unpackhi_epi16(x[2], x[3]);
262 w2 = _mm256_unpackhi_epi16(x[4], x[5]);
263 w3 = _mm256_unpackhi_epi16(x[6], x[7]);
264
265 ww0 = _mm256_unpacklo_epi32(w0, w1); //
266 ww1 = _mm256_unpacklo_epi32(w2, w3); //
267
268 dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
269 dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
270
271 ww0 = _mm256_unpackhi_epi32(w0, w1); //
272 ww1 = _mm256_unpackhi_epi32(w2, w3); //
273
274 dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
275 dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
276
277 w0 = _mm256_unpacklo_epi16(x[8], x[9]);
278 w1 = _mm256_unpacklo_epi16(x[10], x[11]);
279 w2 = _mm256_unpacklo_epi16(x[12], x[13]);
280 w3 = _mm256_unpacklo_epi16(x[14], x[15]);
281
282 ww0 = _mm256_unpacklo_epi32(w0, w1);
283 ww1 = _mm256_unpacklo_epi32(w2, w3);
284
285 dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
286 dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
287
288 ww0 = _mm256_unpackhi_epi32(w0, w1);
289 ww1 = _mm256_unpackhi_epi32(w2, w3);
290
291 dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
292 dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
293
294 w0 = _mm256_unpackhi_epi16(x[8], x[9]);
295 w1 = _mm256_unpackhi_epi16(x[10], x[11]);
296 w2 = _mm256_unpackhi_epi16(x[12], x[13]);
297 w3 = _mm256_unpackhi_epi16(x[14], x[15]);
298
299 ww0 = _mm256_unpacklo_epi32(w0, w1);
300 ww1 = _mm256_unpacklo_epi32(w2, w3);
301
302 dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
303 dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
304
305 ww0 = _mm256_unpackhi_epi32(w0, w1);
306 ww1 = _mm256_unpackhi_epi32(w2, w3);
307
308 dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
309 dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
310
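  // At this point dd[i] holds transposed column i (input rows 0-7) in its low
  // 128-bit lane and column i + 8 (rows 0-7) in its high lane; dd[i + 8]
  // holds the same columns for input rows 8-15. Recombine the lanes so that
  // d[i] is the full column i and d[i + 8] the full column i + 8.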
311 for (int i = 0; i < 8; i++) {
312 d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
313 d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
314 _mm256_extracti128_si256(dd[i], 1), 0);
315 }
316 }
317
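// The DC predictors compute the rounded average of the above and left
// neighbors and broadcast it to the whole block. A scalar sketch of what
// aom_dc_predictor_32x32_avx2 computes (illustrative only, not compiled; the
// helper name is ours, not part of the library):
#if 0
static void dc_predictor_32x32_c_sketch(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  int sum = 0;
  for (int i = 0; i < 32; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)((sum + 32) >> 6);  // round, then divide by 64
  for (int r = 0; r < 32; ++r) {
    for (int c = 0; c < 32; ++c) dst[c] = dc;
    dst += stride;
  }
}
#endif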
void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
319 const uint8_t *above, const uint8_t *left) {
320 const __m256i sum_above = dc_sum_32(above);
321 __m256i sum_left = dc_sum_32(left);
322 sum_left = _mm256_add_epi16(sum_left, sum_above);
323 const __m256i thirtytwo = _mm256_set1_epi16(32);
324 sum_left = _mm256_add_epi16(sum_left, thirtytwo);
325 sum_left = _mm256_srai_epi16(sum_left, 6);
326 const __m256i zero = _mm256_setzero_si256();
327 __m256i row = _mm256_shuffle_epi8(sum_left, zero);
328 row_store_32xh(&row, 32, dst, stride);
329 }
330
void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
332 const uint8_t *above,
333 const uint8_t *left) {
334 __m256i sum = dc_sum_32(above);
335 (void)left;
336
337 const __m256i sixteen = _mm256_set1_epi16(16);
338 sum = _mm256_add_epi16(sum, sixteen);
339 sum = _mm256_srai_epi16(sum, 5);
340 const __m256i zero = _mm256_setzero_si256();
341 __m256i row = _mm256_shuffle_epi8(sum, zero);
342 row_store_32xh(&row, 32, dst, stride);
343 }
344
void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
346 const uint8_t *above,
347 const uint8_t *left) {
348 __m256i sum = dc_sum_32(left);
349 (void)above;
350
351 const __m256i sixteen = _mm256_set1_epi16(16);
352 sum = _mm256_add_epi16(sum, sixteen);
353 sum = _mm256_srai_epi16(sum, 5);
354 const __m256i zero = _mm256_setzero_si256();
355 __m256i row = _mm256_shuffle_epi8(sum, zero);
356 row_store_32xh(&row, 32, dst, stride);
357 }
358
void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
360 const uint8_t *above,
361 const uint8_t *left) {
362 (void)above;
363 (void)left;
364 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
365 row_store_32xh(&row, 32, dst, stride);
366 }
367
void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
369 const uint8_t *above, const uint8_t *left) {
370 const __m256i row = _mm256_loadu_si256((const __m256i *)above);
371 (void)left;
372 row_store_32xh(&row, 32, dst, stride);
373 }
374
// There are 32 rows together. Each call handles lines 0,1,2,3 and
// 16,17,18,19; the next call handles 4,5,6,7 and 20,21,22,23, so four calls
// cover all 32 rows.
static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
380 ptrdiff_t stride) {
381 __m256i t[4];
382 __m256i m = _mm256_setzero_si256();
383 const __m256i inc = _mm256_set1_epi8(4);
384 int i;
385
386 for (i = 0; i < 4; i++) {
387 t[i] = _mm256_shuffle_epi8(*row, m);
388 __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
389 __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
390 _mm256_storeu_si256((__m256i *)dst, r0);
391 _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
392 dst += stride;
393 m = _mm256_add_epi8(m, inc);
394 }
395 }
396
void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
398 const uint8_t *above, const uint8_t *left) {
399 (void)above;
400 const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
401
402 __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
403
404 __m256i v = _mm256_unpacklo_epi8(u, u);
405 h_predictor_32x8line(&v, dst, stride);
406 dst += stride << 2;
407
408 v = _mm256_unpackhi_epi8(u, u);
409 h_predictor_32x8line(&v, dst, stride);
410 dst += stride << 2;
411
412 u = _mm256_unpackhi_epi8(left_col, left_col);
413
414 v = _mm256_unpacklo_epi8(u, u);
415 h_predictor_32x8line(&v, dst, stride);
416 dst += stride << 2;
417
418 v = _mm256_unpackhi_epi8(u, u);
419 h_predictor_32x8line(&v, dst, stride);
420 }
421
422 // -----------------------------------------------------------------------------
423 // Rectangle
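// For rectangular blocks the DC value is the rounded average over all w + h
// neighbors, e.g. for 32x16: (sum(above) + sum(left) + 24) / 48. Since w + h
// is not a power of two here, the final rounding and division are done in
// scalar code after the SIMD sums.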
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
425 const uint8_t *above, const uint8_t *left) {
426 const __m128i top_sum = dc_sum_32_sse2(above);
427 __m128i left_sum = dc_sum_16_sse2(left);
428 left_sum = _mm_add_epi16(top_sum, left_sum);
429 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
430 sum += 24;
431 sum /= 48;
432 const __m256i row = _mm256_set1_epi8((int8_t)sum);
433 row_store_32xh(&row, 16, dst, stride);
434 }
435
void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
437 const uint8_t *above, const uint8_t *left) {
438 const __m256i sum_above = dc_sum_32(above);
439 __m256i sum_left = dc_sum_64(left);
440 sum_left = _mm256_add_epi16(sum_left, sum_above);
441 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
442 sum += 48;
443 sum /= 96;
444 const __m256i row = _mm256_set1_epi8((int8_t)sum);
445 row_store_32xh(&row, 64, dst, stride);
446 }
447
void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
449 const uint8_t *above, const uint8_t *left) {
450 const __m256i sum_above = dc_sum_64(above);
451 __m256i sum_left = dc_sum_64(left);
452 sum_left = _mm256_add_epi16(sum_left, sum_above);
453 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
454 sum += 64;
455 sum /= 128;
456 const __m256i row = _mm256_set1_epi8((int8_t)sum);
457 row_store_64xh(&row, 64, dst, stride);
458 }
459
void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
461 const uint8_t *above, const uint8_t *left) {
462 const __m256i sum_above = dc_sum_64(above);
463 __m256i sum_left = dc_sum_32(left);
464 sum_left = _mm256_add_epi16(sum_left, sum_above);
465 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
466 sum += 48;
467 sum /= 96;
468 const __m256i row = _mm256_set1_epi8((int8_t)sum);
469 row_store_64xh(&row, 32, dst, stride);
470 }
471
void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
473 const uint8_t *above, const uint8_t *left) {
474 const __m256i sum_above = dc_sum_64(above);
475 __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
476 sum_left = _mm256_add_epi16(sum_left, sum_above);
477 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
478 sum += 40;
479 sum /= 80;
480 const __m256i row = _mm256_set1_epi8((int8_t)sum);
481 row_store_64xh(&row, 16, dst, stride);
482 }
483
void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
485 const uint8_t *above,
486 const uint8_t *left) {
487 __m256i sum = dc_sum_32(above);
488 (void)left;
489
490 const __m256i sixteen = _mm256_set1_epi16(16);
491 sum = _mm256_add_epi16(sum, sixteen);
492 sum = _mm256_srai_epi16(sum, 5);
493 const __m256i zero = _mm256_setzero_si256();
494 __m256i row = _mm256_shuffle_epi8(sum, zero);
495 row_store_32xh(&row, 16, dst, stride);
496 }
497
void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
499 const uint8_t *above,
500 const uint8_t *left) {
501 __m256i sum = dc_sum_32(above);
502 (void)left;
503
504 const __m256i sixteen = _mm256_set1_epi16(16);
505 sum = _mm256_add_epi16(sum, sixteen);
506 sum = _mm256_srai_epi16(sum, 5);
507 const __m256i zero = _mm256_setzero_si256();
508 __m256i row = _mm256_shuffle_epi8(sum, zero);
509 row_store_32xh(&row, 64, dst, stride);
510 }
511
void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
513 const uint8_t *above,
514 const uint8_t *left) {
515 __m256i sum = dc_sum_64(above);
516 (void)left;
517
518 const __m256i thirtytwo = _mm256_set1_epi16(32);
519 sum = _mm256_add_epi16(sum, thirtytwo);
520 sum = _mm256_srai_epi16(sum, 6);
521 const __m256i zero = _mm256_setzero_si256();
522 __m256i row = _mm256_shuffle_epi8(sum, zero);
523 row_store_64xh(&row, 64, dst, stride);
524 }
525
void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
527 const uint8_t *above,
528 const uint8_t *left) {
529 __m256i sum = dc_sum_64(above);
530 (void)left;
531
532 const __m256i thirtytwo = _mm256_set1_epi16(32);
533 sum = _mm256_add_epi16(sum, thirtytwo);
534 sum = _mm256_srai_epi16(sum, 6);
535 const __m256i zero = _mm256_setzero_si256();
536 __m256i row = _mm256_shuffle_epi8(sum, zero);
537 row_store_64xh(&row, 32, dst, stride);
538 }
539
void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
541 const uint8_t *above,
542 const uint8_t *left) {
543 __m256i sum = dc_sum_64(above);
544 (void)left;
545
546 const __m256i thirtytwo = _mm256_set1_epi16(32);
547 sum = _mm256_add_epi16(sum, thirtytwo);
548 sum = _mm256_srai_epi16(sum, 6);
549 const __m256i zero = _mm256_setzero_si256();
550 __m256i row = _mm256_shuffle_epi8(sum, zero);
551 row_store_64xh(&row, 16, dst, stride);
552 }
553
void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
555 const uint8_t *above,
556 const uint8_t *left) {
557 __m128i sum = dc_sum_16_sse2(left);
558 (void)above;
559
560 const __m128i eight = _mm_set1_epi16(8);
561 sum = _mm_add_epi16(sum, eight);
562 sum = _mm_srai_epi16(sum, 4);
563 const __m128i zero = _mm_setzero_si128();
564 const __m128i r = _mm_shuffle_epi8(sum, zero);
565 const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
566 row_store_32xh(&row, 16, dst, stride);
567 }
568
void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
570 const uint8_t *above,
571 const uint8_t *left) {
572 __m256i sum = dc_sum_64(left);
573 (void)above;
574
575 const __m256i thirtytwo = _mm256_set1_epi16(32);
576 sum = _mm256_add_epi16(sum, thirtytwo);
577 sum = _mm256_srai_epi16(sum, 6);
578 const __m256i zero = _mm256_setzero_si256();
579 __m256i row = _mm256_shuffle_epi8(sum, zero);
580 row_store_32xh(&row, 64, dst, stride);
581 }
582
void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
584 const uint8_t *above,
585 const uint8_t *left) {
586 __m256i sum = dc_sum_64(left);
587 (void)above;
588
589 const __m256i thirtytwo = _mm256_set1_epi16(32);
590 sum = _mm256_add_epi16(sum, thirtytwo);
591 sum = _mm256_srai_epi16(sum, 6);
592 const __m256i zero = _mm256_setzero_si256();
593 __m256i row = _mm256_shuffle_epi8(sum, zero);
594 row_store_64xh(&row, 64, dst, stride);
595 }
596
void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
598 const uint8_t *above,
599 const uint8_t *left) {
600 __m256i sum = dc_sum_32(left);
601 (void)above;
602
603 const __m256i sixteen = _mm256_set1_epi16(16);
604 sum = _mm256_add_epi16(sum, sixteen);
605 sum = _mm256_srai_epi16(sum, 5);
606 const __m256i zero = _mm256_setzero_si256();
607 __m256i row = _mm256_shuffle_epi8(sum, zero);
608 row_store_64xh(&row, 32, dst, stride);
609 }
610
void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
612 const uint8_t *above,
613 const uint8_t *left) {
614 __m128i sum = dc_sum_16_sse2(left);
615 (void)above;
616
617 const __m128i eight = _mm_set1_epi16(8);
618 sum = _mm_add_epi16(sum, eight);
619 sum = _mm_srai_epi16(sum, 4);
620 const __m128i zero = _mm_setzero_si128();
621 const __m128i r = _mm_shuffle_epi8(sum, zero);
622 const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
623 row_store_64xh(&row, 16, dst, stride);
624 }
625
void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
627 const uint8_t *above,
628 const uint8_t *left) {
629 (void)above;
630 (void)left;
631 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
632 row_store_32xh(&row, 16, dst, stride);
633 }
634
void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
636 const uint8_t *above,
637 const uint8_t *left) {
638 (void)above;
639 (void)left;
640 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
641 row_store_32xh(&row, 64, dst, stride);
642 }
643
void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
645 const uint8_t *above,
646 const uint8_t *left) {
647 (void)above;
648 (void)left;
649 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
650 row_store_64xh(&row, 64, dst, stride);
651 }
652
void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
654 const uint8_t *above,
655 const uint8_t *left) {
656 (void)above;
657 (void)left;
658 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
659 row_store_64xh(&row, 32, dst, stride);
660 }
661
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
663 const uint8_t *above,
664 const uint8_t *left) {
665 (void)above;
666 (void)left;
667 const __m256i row = _mm256_set1_epi8((int8_t)0x80);
668 row_store_64xh(&row, 16, dst, stride);
669 }
670
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
672 const uint8_t *above, const uint8_t *left) {
673 const __m256i row = _mm256_loadu_si256((const __m256i *)above);
674 (void)left;
675 row_store_32xh(&row, 16, dst, stride);
676 }
677
void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
679 const uint8_t *above, const uint8_t *left) {
680 const __m256i row = _mm256_loadu_si256((const __m256i *)above);
681 (void)left;
682 row_store_32xh(&row, 64, dst, stride);
683 }
684
void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
686 const uint8_t *above, const uint8_t *left) {
687 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
688 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
689 (void)left;
690 row_store_32x2xh(&row0, &row1, 64, dst, stride);
691 }
692
void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
694 const uint8_t *above, const uint8_t *left) {
695 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
696 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
697 (void)left;
698 row_store_32x2xh(&row0, &row1, 32, dst, stride);
699 }
700
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
702 const uint8_t *above, const uint8_t *left) {
703 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
704 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
705 (void)left;
706 row_store_32x2xh(&row0, &row1, 16, dst, stride);
707 }
708
709 // -----------------------------------------------------------------------------
710 // PAETH_PRED
711
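// The Paeth predictor picks, per pixel, whichever of left, top and top-left
// is closest to base = left + top - topleft, preferring left, then top, on
// ties. A scalar sketch of the selection that paeth_pred() below implements
// with SIMD compares and masks (illustrative only, not compiled; the helper
// name is ours, not part of the library):
#if 0
static uint8_t paeth_select_sketch(int left, int top, int topleft) {
  const int base = left + top - topleft;
  const int pl = abs(base - left);
  const int pt = abs(base - top);
  const int ptl = abs(base - topleft);
  if (pl <= pt && pl <= ptl) return (uint8_t)left;
  if (pt <= ptl) return (uint8_t)top;
  return (uint8_t)topleft;
}
#endif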
712 // Return 16 16-bit pixels in one row (__m256i)
static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
714 const __m256i *topleft) {
715 const __m256i base =
716 _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
717
718 __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
719 __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
720 __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
721
722 __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
723 mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
724 __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
725
726 pl = _mm256_andnot_si256(mask1, *left);
727
728 ptl = _mm256_and_si256(mask2, *topleft);
729 pt = _mm256_andnot_si256(mask2, *top);
730 pt = _mm256_or_si256(pt, ptl);
731 pt = _mm256_and_si256(mask1, pt);
732
733 return _mm256_or_si256(pt, pl);
734 }
735
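// _mm256_packus_epi16 packs within each 128-bit lane, so paeth_16x1_pred()
// and paeth_32x1_pred() first rotate the high lane down with
// _mm256_permute4x64_epi64(p0, 0xe); after packing, the 16 bytes of interest
// sit in the low 128 bits in order.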
736 // Return 16 8-bit pixels in one row (__m128i)
static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
738 const __m256i *topleft) {
739 const __m256i p0 = paeth_pred(left, top, topleft);
740 const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
741 const __m256i p = _mm256_packus_epi16(p0, p1);
742 return _mm256_castsi256_si128(p);
743 }
744
static INLINE __m256i get_top_vector(const uint8_t *above) {
746 const __m128i x = _mm_load_si128((const __m128i *)above);
747 const __m128i zero = _mm_setzero_si128();
748 const __m128i t0 = _mm_unpacklo_epi8(x, zero);
749 const __m128i t1 = _mm_unpackhi_epi8(x, zero);
750 return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
751 }
752
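// In the kernels below, `rep` starts at 0x8000 in every 16-bit lane: byte 0
// of each word is a shuffle index into the left vector (which is duplicated
// into both 128-bit lanes) and byte 1 has its top bit set, so
// _mm256_shuffle_epi8(l, rep) zero-extends left[row] into every 16-bit lane.
// Adding 1 per row steps to the next left pixel.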
void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
754 const uint8_t *above, const uint8_t *left) {
755 __m128i x = _mm_loadl_epi64((const __m128i *)left);
756 const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
757 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
758 __m256i rep = _mm256_set1_epi16((short)0x8000);
759 const __m256i one = _mm256_set1_epi16(1);
760 const __m256i top = get_top_vector(above);
761
762 int i;
763 for (i = 0; i < 8; ++i) {
764 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
765 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
766
767 _mm_store_si128((__m128i *)dst, row);
768 dst += stride;
769 rep = _mm256_add_epi16(rep, one);
770 }
771 }
772
static INLINE __m256i get_left_vector(const uint8_t *left) {
774 const __m128i x = _mm_load_si128((const __m128i *)left);
775 return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
776 }
777
void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
779 const uint8_t *above, const uint8_t *left) {
780 const __m256i l = get_left_vector(left);
781 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
782 __m256i rep = _mm256_set1_epi16((short)0x8000);
783 const __m256i one = _mm256_set1_epi16(1);
784 const __m256i top = get_top_vector(above);
785
786 int i;
787 for (i = 0; i < 16; ++i) {
788 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
789 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
790
791 _mm_store_si128((__m128i *)dst, row);
792 dst += stride;
793 rep = _mm256_add_epi16(rep, one);
794 }
795 }
796
void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
798 const uint8_t *above, const uint8_t *left) {
799 __m256i l = get_left_vector(left);
800 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
801 __m256i rep = _mm256_set1_epi16((short)0x8000);
802 const __m256i one = _mm256_set1_epi16(1);
803 const __m256i top = get_top_vector(above);
804
805 int i;
806 for (i = 0; i < 16; ++i) {
807 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
808 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
809
810 _mm_store_si128((__m128i *)dst, row);
811 dst += stride;
812 rep = _mm256_add_epi16(rep, one);
813 }
814
815 l = get_left_vector(left + 16);
816 rep = _mm256_set1_epi16((short)0x8000);
817 for (i = 0; i < 16; ++i) {
818 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
819 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
820
821 _mm_store_si128((__m128i *)dst, row);
822 dst += stride;
823 rep = _mm256_add_epi16(rep, one);
824 }
825 }
826
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
828 const uint8_t *above, const uint8_t *left) {
829 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
830 const __m256i one = _mm256_set1_epi16(1);
831 const __m256i top = get_top_vector(above);
832
833 for (int j = 0; j < 4; ++j) {
834 const __m256i l = get_left_vector(left + j * 16);
835 __m256i rep = _mm256_set1_epi16((short)0x8000);
836 for (int i = 0; i < 16; ++i) {
837 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
838 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
839
840 _mm_store_si128((__m128i *)dst, row);
841 dst += stride;
842 rep = _mm256_add_epi16(rep, one);
843 }
844 }
845 }
846
847 // Return 32 8-bit pixels in one row (__m256i)
static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
849 const __m256i *top1,
850 const __m256i *topleft) {
851 __m256i p0 = paeth_pred(left, top0, topleft);
852 __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
853 const __m256i x0 = _mm256_packus_epi16(p0, p1);
854
855 p0 = paeth_pred(left, top1, topleft);
856 p1 = _mm256_permute4x64_epi64(p0, 0xe);
857 const __m256i x1 = _mm256_packus_epi16(p0, p1);
858
859 return _mm256_permute2x128_si256(x0, x1, 0x20);
860 }
861
void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
863 const uint8_t *above, const uint8_t *left) {
864 const __m256i l = get_left_vector(left);
865 const __m256i t0 = get_top_vector(above);
866 const __m256i t1 = get_top_vector(above + 16);
867 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
868 __m256i rep = _mm256_set1_epi16((short)0x8000);
869 const __m256i one = _mm256_set1_epi16(1);
870
871 int i;
872 for (i = 0; i < 16; ++i) {
873 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
874
875 const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
876
877 _mm256_storeu_si256((__m256i *)dst, r);
878
879 dst += stride;
880 rep = _mm256_add_epi16(rep, one);
881 }
882 }
883
void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
885 const uint8_t *above, const uint8_t *left) {
886 __m256i l = get_left_vector(left);
887 const __m256i t0 = get_top_vector(above);
888 const __m256i t1 = get_top_vector(above + 16);
889 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
890 __m256i rep = _mm256_set1_epi16((short)0x8000);
891 const __m256i one = _mm256_set1_epi16(1);
892
893 int i;
894 for (i = 0; i < 16; ++i) {
895 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
896
897 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
898 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
899
900 _mm_store_si128((__m128i *)dst, r0);
901 _mm_store_si128((__m128i *)(dst + 16), r1);
902
903 dst += stride;
904 rep = _mm256_add_epi16(rep, one);
905 }
906
907 l = get_left_vector(left + 16);
908 rep = _mm256_set1_epi16((short)0x8000);
909 for (i = 0; i < 16; ++i) {
910 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
911
912 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
913 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
914
915 _mm_store_si128((__m128i *)dst, r0);
916 _mm_store_si128((__m128i *)(dst + 16), r1);
917
918 dst += stride;
919 rep = _mm256_add_epi16(rep, one);
920 }
921 }
922
void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
924 const uint8_t *above, const uint8_t *left) {
925 const __m256i t0 = get_top_vector(above);
926 const __m256i t1 = get_top_vector(above + 16);
927 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
928 const __m256i one = _mm256_set1_epi16(1);
929
930 int i, j;
931 for (j = 0; j < 4; ++j) {
932 const __m256i l = get_left_vector(left + j * 16);
933 __m256i rep = _mm256_set1_epi16((short)0x8000);
934 for (i = 0; i < 16; ++i) {
935 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
936
937 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
938 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
939
940 _mm_store_si128((__m128i *)dst, r0);
941 _mm_store_si128((__m128i *)(dst + 16), r1);
942
943 dst += stride;
944 rep = _mm256_add_epi16(rep, one);
945 }
946 }
947 }
948
void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
950 const uint8_t *above, const uint8_t *left) {
951 const __m256i t0 = get_top_vector(above);
952 const __m256i t1 = get_top_vector(above + 16);
953 const __m256i t2 = get_top_vector(above + 32);
954 const __m256i t3 = get_top_vector(above + 48);
955 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
956 const __m256i one = _mm256_set1_epi16(1);
957
958 int i, j;
959 for (j = 0; j < 2; ++j) {
960 const __m256i l = get_left_vector(left + j * 16);
961 __m256i rep = _mm256_set1_epi16((short)0x8000);
962 for (i = 0; i < 16; ++i) {
963 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
964
965 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
966 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
967 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
968 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
969
970 _mm_store_si128((__m128i *)dst, r0);
971 _mm_store_si128((__m128i *)(dst + 16), r1);
972 _mm_store_si128((__m128i *)(dst + 32), r2);
973 _mm_store_si128((__m128i *)(dst + 48), r3);
974
975 dst += stride;
976 rep = _mm256_add_epi16(rep, one);
977 }
978 }
979 }
980
void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
982 const uint8_t *above, const uint8_t *left) {
983 const __m256i t0 = get_top_vector(above);
984 const __m256i t1 = get_top_vector(above + 16);
985 const __m256i t2 = get_top_vector(above + 32);
986 const __m256i t3 = get_top_vector(above + 48);
987 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
988 const __m256i one = _mm256_set1_epi16(1);
989
990 int i, j;
991 for (j = 0; j < 4; ++j) {
992 const __m256i l = get_left_vector(left + j * 16);
993 __m256i rep = _mm256_set1_epi16((short)0x8000);
994 for (i = 0; i < 16; ++i) {
995 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
996
997 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
998 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
999 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1000 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1001
1002 _mm_store_si128((__m128i *)dst, r0);
1003 _mm_store_si128((__m128i *)(dst + 16), r1);
1004 _mm_store_si128((__m128i *)(dst + 32), r2);
1005 _mm_store_si128((__m128i *)(dst + 48), r3);
1006
1007 dst += stride;
1008 rep = _mm256_add_epi16(rep, one);
1009 }
1010 }
1011 }
1012
void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
1014 const uint8_t *above, const uint8_t *left) {
1015 const __m256i t0 = get_top_vector(above);
1016 const __m256i t1 = get_top_vector(above + 16);
1017 const __m256i t2 = get_top_vector(above + 32);
1018 const __m256i t3 = get_top_vector(above + 48);
1019 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
1020 const __m256i one = _mm256_set1_epi16(1);
1021
1022 int i;
1023 const __m256i l = get_left_vector(left);
1024 __m256i rep = _mm256_set1_epi16((short)0x8000);
1025 for (i = 0; i < 16; ++i) {
1026 const __m256i l16 = _mm256_shuffle_epi8(l, rep);
1027
1028 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
1029 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
1030 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
1031 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
1032
1033 _mm_store_si128((__m128i *)dst, r0);
1034 _mm_store_si128((__m128i *)(dst + 16), r1);
1035 _mm_store_si128((__m128i *)(dst + 32), r2);
1036 _mm_store_si128((__m128i *)(dst + 48), r3);
1037
1038 dst += stride;
1039 rep = _mm256_add_epi16(rep, one);
1040 }
1041 }
1042
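// Build immediate control operands for _mm256_permute4x64_epi64 (four 2-bit
// qword selectors) and _mm256_permute2x128_si256 (two 4-bit lane selectors).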
1043 #define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
1044 #define PERM2x128(c0, c1) c0 + (c1 << 4)
1045
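// The Z1 kernels below walk along the above edge: for each row the fractional
// position x advances by dx, base = x >> frac_bits picks the two neighboring
// pixels and shift = ((x << upsample_above) & 0x3f) >> 1 is the 5-bit
// interpolation weight (frac_bits == 6 - upsample_above). A scalar sketch of
// one 4-pixel row (illustrative only, not compiled; clamping against
// above[max_base_x] is omitted, and the helper name is ours):
#if 0
static void dr_z1_row_sketch(uint16_t *dst, const uint16_t *above, int x,
                             int frac_bits, int upsample_above) {
  const int base0 = x >> frac_bits;
  const int shift = ((x << upsample_above) & 0x3f) >> 1;
  const int step = 1 << upsample_above;
  for (int c = 0; c < 4; ++c) {
    const int base = base0 + c * step;
    dst[c] = (uint16_t)((above[base] * 32 + 16 +
                         (above[base + 1] - above[base]) * shift) >>
                        5);
  }
}
#endif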
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
1047 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1048 const int frac_bits = 6 - upsample_above;
1049 const int max_base_x = ((N + 4) - 1) << upsample_above;
1050
1051 assert(dx > 0);
1052 // pre-filter above pixels
1053 // store in temp buffers:
1054 // above[x] * 32 + 16
1055 // above[x+1] - above[x]
1056 // final pixels will be calculated as:
1057 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1058 __m256i a0, a1, a32, a16;
1059 __m256i diff, c3f;
1060 __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1061 __m128i a0_128, a1_128;
1062 a16 = _mm256_set1_epi16(16);
1063 a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1064 max_base_x128 = _mm_set1_epi16(max_base_x);
1065 c3f = _mm256_set1_epi16(0x3f);
1066
1067 int x = dx;
1068 for (int r = 0; r < N; r++) {
1069 __m256i b, res, shift;
1070 __m128i res1;
1071
1072 int base = x >> frac_bits;
1073 if (base >= max_base_x) {
1074 for (int i = r; i < N; ++i) {
1075 dst[i] = a_mbase_x; // save 4 values
1076 }
1077 return;
1078 }
1079
1080 a0_128 = _mm_loadu_si128((__m128i *)(above + base));
1081 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1082
1083 if (upsample_above) {
1084 a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
1085 a1_128 = _mm_srli_si128(a0_128, 8);
1086
1087 base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
1088 base + 10, base + 12, base + 14);
1089 shift = _mm256_srli_epi16(
1090 _mm256_and_si256(
1091 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
1092 _mm256_set1_epi16(0x3f)),
1093 1);
1094 } else {
1095 base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
1096 base + 5, base + 6, base + 7);
1097 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1098 }
1099 a0 = _mm256_castsi128_si256(a0_128);
1100 a1 = _mm256_castsi128_si256(a1_128);
1101 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1102 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1103 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1104
1105 b = _mm256_mullo_epi16(diff, shift);
1106 res = _mm256_add_epi16(a32, b);
1107 res = _mm256_srli_epi16(res, 5);
1108 res1 = _mm256_castsi256_si128(res);
1109
1110 mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
1111 dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1112 x += dx;
1113 }
1114 }
1115
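// Same computation as above, but with 32-bit intermediates: for 12-bit
// sources above[x] * 32 + 16 no longer fits in 16 bits, so the bd == 12 path
// (see the bd < 12 check in highbd_dr_prediction_z1_4xN_avx2 below) widens to
// epi32 and packs back down at the end.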
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
1117 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1118 const int frac_bits = 6 - upsample_above;
1119 const int max_base_x = ((N + 4) - 1) << upsample_above;
1120
1121 assert(dx > 0);
1122 // pre-filter above pixels
1123 // store in temp buffers:
1124 // above[x] * 32 + 16
1125 // above[x+1] - above[x]
1126 // final pixels will be calculated as:
1127 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1128 __m256i a0, a1, a32, a16;
1129 __m256i diff;
1130 __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
1131
1132 a16 = _mm256_set1_epi32(16);
1133 a_mbase_x = _mm_set1_epi16(above[max_base_x]);
1134 max_base_x128 = _mm_set1_epi32(max_base_x);
1135
1136 int x = dx;
1137 for (int r = 0; r < N; r++) {
1138 __m256i b, res, shift;
1139 __m128i res1;
1140
1141 int base = x >> frac_bits;
1142 if (base >= max_base_x) {
1143 for (int i = r; i < N; ++i) {
1144 dst[i] = a_mbase_x; // save 4 values
1145 }
1146 return;
1147 }
1148
1149 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1150 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1151
1152 if (upsample_above) {
1153 a0 = _mm256_permutevar8x32_epi32(
1154 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1155 a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1156 base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
1157 shift = _mm256_srli_epi32(
1158 _mm256_and_si256(
1159 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1160 _mm256_set1_epi32(0x3f)),
1161 1);
1162 } else {
1163 base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
1164 shift = _mm256_srli_epi32(
1165 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1166 }
1167
1168 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1169 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1170 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1171
1172 b = _mm256_mullo_epi32(diff, shift);
1173 res = _mm256_add_epi32(a32, b);
1174 res = _mm256_srli_epi32(res, 5);
1175
1176 res1 = _mm256_castsi256_si128(res);
1177 res1 = _mm_packus_epi32(res1, res1);
1178
1179 mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bits
1181 dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
1182 x += dx;
1183 }
1184 }
1185
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
1187 ptrdiff_t stride,
1188 const uint16_t *above,
1189 int upsample_above, int dx,
1190 int bd) {
1191 __m128i dstvec[16];
1192 if (bd < 12) {
1193 highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
1194 dx);
1195 } else {
1196 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
1197 upsample_above, dx);
1198 }
1199 for (int i = 0; i < N; i++) {
1200 _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
1201 }
1202 }
1203
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
1205 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1206 const int frac_bits = 6 - upsample_above;
1207 const int max_base_x = ((8 + N) - 1) << upsample_above;
1208
1209 assert(dx > 0);
1210 // pre-filter above pixels
1211 // store in temp buffers:
1212 // above[x] * 32 + 16
1213 // above[x+1] - above[x]
1214 // final pixels will be calculated as:
1215 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1216 __m256i a0, a1, a0_1, a1_1, a32, a16;
1217 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1218
1219 a16 = _mm256_set1_epi32(16);
1220 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1221 max_base_x256 = _mm256_set1_epi32(max_base_x);
1222
1223 int x = dx;
1224 for (int r = 0; r < N; r++) {
1225 __m256i b, res, res1, shift;
1226
1227 int base = x >> frac_bits;
1228 if (base >= max_base_x) {
1229 for (int i = r; i < N; ++i) {
1230 dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
1231 }
1232 return;
1233 }
1234
1235 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1236 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1237
1238 if (upsample_above) {
1239 a0 = _mm256_permutevar8x32_epi32(
1240 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1241 a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
1242
1243 a0_1 =
1244 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1245 a0_1 = _mm256_permutevar8x32_epi32(
1246 a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
1247 a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
1248
1249 a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
1250 a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
1251 base_inc256 =
1252 _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
1253 base + 10, base + 12, base + 14);
1254 shift = _mm256_srli_epi32(
1255 _mm256_and_si256(
1256 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
1257 _mm256_set1_epi32(0x3f)),
1258 1);
1259 } else {
1260 base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
1261 base + 4, base + 5, base + 6, base + 7);
1262 shift = _mm256_srli_epi32(
1263 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1264 }
1265
1266 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1267 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1268 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1269
1270 b = _mm256_mullo_epi32(diff, shift);
1271 res = _mm256_add_epi32(a32, b);
1272 res = _mm256_srli_epi32(res, 5);
1273
1274 res1 = _mm256_packus_epi32(
1275 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
1276
1277 mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
1278 mask256 = _mm256_packs_epi32(
1279 mask256, _mm256_castsi128_si256(
                     _mm256_extracti128_si256(mask256, 1)));  // narrow to 16 bits
1281 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1282 dst[r] = _mm256_castsi256_si128(res1);
1283 x += dx;
1284 }
1285 }
1286
static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
1288 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
1289 const int frac_bits = 6 - upsample_above;
1290 const int max_base_x = ((8 + N) - 1) << upsample_above;
1291
1292 assert(dx > 0);
1293 // pre-filter above pixels
1294 // store in temp buffers:
1295 // above[x] * 32 + 16
1296 // above[x+1] - above[x]
1297 // final pixels will be calculated as:
1298 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1299 __m256i a0, a1, a32, a16, c3f;
1300 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1301 __m128i a0_x128, a1_x128;
1302
1303 a16 = _mm256_set1_epi16(16);
1304 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1305 max_base_x256 = _mm256_set1_epi16(max_base_x);
1306 c3f = _mm256_set1_epi16(0x3f);
1307
1308 int x = dx;
1309 for (int r = 0; r < N; r++) {
1310 __m256i b, res, res1, shift;
1311
1312 int base = x >> frac_bits;
1313 if (base >= max_base_x) {
1314 for (int i = r; i < N; ++i) {
1315 dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
1316 }
1317 return;
1318 }
1319
1320 a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
1321 if (upsample_above) {
1322 __m128i mask, atmp0, atmp1, atmp2, atmp3;
1323 a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
1324 atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1325 atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
1326 atmp2 =
1327 _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1328 atmp3 =
1329 _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
1330 mask =
1331 _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
1332 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
1333 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
1334 _mm_set1_epi8(15));
1335 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
1336
1337 base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
1338 base + 8, base + 10, base + 12, base + 14,
1339 0, 0, 0, 0, 0, 0, 0, 0);
1340 shift = _mm256_srli_epi16(
1341 _mm256_and_si256(
1342 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
1343 1);
1344 } else {
1345 a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
1346 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1347 base + 4, base + 5, base + 6, base + 7, 0,
1348 0, 0, 0, 0, 0, 0, 0);
1349 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1350 }
1351 a0 = _mm256_castsi128_si256(a0_x128);
1352 a1 = _mm256_castsi128_si256(a1_x128);
1353
1354 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1355 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1356 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1357
1358 b = _mm256_mullo_epi16(diff, shift);
1359 res = _mm256_add_epi16(a32, b);
1360 res = _mm256_srli_epi16(res, 5);
1361
1362 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1363 res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1364 dst[r] = _mm256_castsi256_si128(res1);
1365 x += dx;
1366 }
1367 }
1368
static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
1370 ptrdiff_t stride,
1371 const uint16_t *above,
1372 int upsample_above, int dx,
1373 int bd) {
1374 __m128i dstvec[32];
1375 if (bd < 12) {
1376 highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
1377 dx);
1378 } else {
1379 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
1380 upsample_above, dx);
1381 }
1382 for (int i = 0; i < N; i++) {
1383 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
1384 }
1385 }
1386
static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
1388 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1389 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1390 (void)upsample_above;
1391 const int frac_bits = 6;
1392 const int max_base_x = ((16 + N) - 1);
1393
1394 // pre-filter above pixels
1395 // store in temp buffers:
1396 // above[x] * 32 + 16
1397 // above[x+1] - above[x]
1398 // final pixels will be calculated as:
1399 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1400 __m256i a0, a0_1, a1, a1_1, a32, a16;
1401 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1402
1403 a16 = _mm256_set1_epi32(16);
1404 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1405 max_base_x256 = _mm256_set1_epi16(max_base_x);
1406
1407 int x = dx;
1408 for (int r = 0; r < N; r++) {
1409 __m256i b, res[2], res1;
1410
1411 int base = x >> frac_bits;
1412 if (base >= max_base_x) {
1413 for (int i = r; i < N; ++i) {
1414 dstvec[i] = a_mbase_x; // save 16 values
1415 }
1416 return;
1417 }
1418 __m256i shift = _mm256_srli_epi32(
1419 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1420
1421 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
1422 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
1423
1424 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1425 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1426 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1427 b = _mm256_mullo_epi32(diff, shift);
1428
1429 res[0] = _mm256_add_epi32(a32, b);
1430 res[0] = _mm256_srli_epi32(res[0], 5);
1431 res[0] = _mm256_packus_epi32(
1432 res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1433
1434 int mdif = max_base_x - base;
1435 if (mdif > 8) {
1436 a0_1 =
1437 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
1438 a1_1 =
1439 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
1440
1441 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1442 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1443 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1444 b = _mm256_mullo_epi32(diff, shift);
1445
1446 res[1] = _mm256_add_epi32(a32, b);
1447 res[1] = _mm256_srli_epi32(res[1], 5);
1448 res[1] = _mm256_packus_epi32(
1449 res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1450 } else {
1451 res[1] = a_mbase_x;
1452 }
1453 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1454                                    1);  // 16 16-bit values
1455
1456 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1457 base + 4, base + 5, base + 6, base + 7,
1458 base + 8, base + 9, base + 10, base + 11,
1459 base + 12, base + 13, base + 14, base + 15);
1460 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1461 dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1462 x += dx;
1463 }
1464 }
1465
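// Note on the packus/extract pattern used by the 32-bit kernel above:
// _mm256_packus_epi32(a, b) packs within each 128-bit lane (the low lane of
// the result is pack(a.lo, b.lo)).  Packing res with a copy of its own high
// lane moved into the low position therefore leaves all eight 16-bit results
// of one 8-wide computation contiguous in the low 128 bits, which is the
// layout the following _mm256_inserti128_si256() expects:
//
//   if res = { r0, r1, r2, r3 | r4, r5, r6, r7 }  (32-bit words per lane),
//   the low lane of the packed result is { r0 .. r7 } as 16-bit words.
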
1466 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
1467 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1468 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1469 (void)upsample_above;
1470 const int frac_bits = 6;
1471 const int max_base_x = ((16 + N) - 1);
1472
1473 // pre-filter above pixels
1474 // store in temp buffers:
1475 // above[x] * 32 + 16
1476 // above[x+1] - above[x]
1477 // final pixels will be calculated as:
1478 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1479 __m256i a0, a1, a32, a16, c3f;
1480 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1481
1482 a16 = _mm256_set1_epi16(16);
1483 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1484 max_base_x256 = _mm256_set1_epi16(max_base_x);
1485 c3f = _mm256_set1_epi16(0x3f);
1486
1487 int x = dx;
1488 for (int r = 0; r < N; r++) {
1489 __m256i b, res;
1490
1491 int base = x >> frac_bits;
1492 if (base >= max_base_x) {
1493 for (int i = r; i < N; ++i) {
1494 dstvec[i] = a_mbase_x; // save 16 values
1495 }
1496 return;
1497 }
1498 __m256i shift =
1499 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1500
1501 a0 = _mm256_loadu_si256((__m256i *)(above + base));
1502 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
1503
1504 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1505 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1506 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1507 b = _mm256_mullo_epi16(diff, shift);
1508
1509 res = _mm256_add_epi16(a32, b);
1510 res = _mm256_srli_epi16(res, 5); // 16 16bit values
1511
1512 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
1513 base + 4, base + 5, base + 6, base + 7,
1514 base + 8, base + 9, base + 10, base + 11,
1515 base + 12, base + 13, base + 14, base + 15);
1516 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1517 dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1518 x += dx;
1519 }
1520 }
1521
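// The cmpgt/blendv pair above is the "beyond the top-right edge" clamp.
// Hand-worked example for a 16x16 block (max_base_x = 31): if a row lands on
// base = 28, base_inc256 holds 28..43 and the compare is true only for
// 28..30, so columns 0..2 keep their interpolated values while columns 3..15
// are replaced with above[31], matching the scalar clamp
// base + c >= max_base_x.
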
1522 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
1523 ptrdiff_t stride,
1524 const uint16_t *above,
1525 int upsample_above, int dx,
1526 int bd) {
1527 __m256i dstvec[64];
1528 if (bd < 12) {
1529 highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
1530 dx);
1531 } else {
1532 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
1533 upsample_above, dx);
1534 }
1535 for (int i = 0; i < N; i++) {
1536 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1537 }
1538 }
1539
1540 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
1541 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1542 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1543 (void)upsample_above;
1544 const int frac_bits = 6;
1545 const int max_base_x = ((32 + N) - 1);
1546
1547 // pre-filter above pixels
1548 // store in temp buffers:
1549 // above[x] * 32 + 16
1550 // above[x+1] - above[x]
1551 // final pixels will be calculated as:
1552 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1553 __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
1554 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1555
1556 a16 = _mm256_set1_epi32(16);
1557 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1558 max_base_x256 = _mm256_set1_epi16(max_base_x);
1559 c3f = _mm256_set1_epi16(0x3f);
1560
1561 int x = dx;
1562 for (int r = 0; r < N; r++) {
1563 __m256i b, res[2], res1;
1564
1565 int base = x >> frac_bits;
1566 if (base >= max_base_x) {
1567 for (int i = r; i < N; ++i) {
1568 dstvec[i] = a_mbase_x; // save 32 values
1569 dstvec[i + N] = a_mbase_x;
1570 }
1571 return;
1572 }
1573
1574 __m256i shift =
1575 _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
1576
1577 for (int j = 0; j < 32; j += 16) {
1578 int mdif = max_base_x - (base + j);
1579 if (mdif <= 0) {
1580 res1 = a_mbase_x;
1581 } else {
1582 a0 = _mm256_cvtepu16_epi32(
1583 _mm_loadu_si128((__m128i *)(above + base + j)));
1584 a1 = _mm256_cvtepu16_epi32(
1585 _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
1586
1587 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1588 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1589 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1590 b = _mm256_mullo_epi32(diff, shift);
1591
1592 res[0] = _mm256_add_epi32(a32, b);
1593 res[0] = _mm256_srli_epi32(res[0], 5);
1594 res[0] = _mm256_packus_epi32(
1595 res[0],
1596 _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1597 if (mdif > 8) {
1598 a0_1 = _mm256_cvtepu16_epi32(
1599 _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
1600 a1_1 = _mm256_cvtepu16_epi32(
1601 _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
1602
1603 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1604 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1605 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1606 b = _mm256_mullo_epi32(diff, shift);
1607
1608 res[1] = _mm256_add_epi32(a32, b);
1609 res[1] = _mm256_srli_epi32(res[1], 5);
1610 res[1] = _mm256_packus_epi32(
1611 res[1],
1612 _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1613 } else {
1614 res[1] = a_mbase_x;
1615 }
1616 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1617                                      1);  // 16 16-bit values
1618 base_inc256 = _mm256_setr_epi16(
1619 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1620 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1621 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1622 base + j + 13, base + j + 14, base + j + 15);
1623
1624 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1625 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1626 }
1627 if (!j) {
1628 dstvec[r] = res1;
1629 } else {
1630 dstvec[r + N] = res1;
1631 }
1632 }
1633 x += dx;
1634 }
1635 }
1636
1637 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
1638 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
1639 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1640 (void)upsample_above;
1641 const int frac_bits = 6;
1642 const int max_base_x = ((32 + N) - 1);
1643
1644 // pre-filter above pixels
1645 // store in temp buffers:
1646 // above[x] * 32 + 16
1647 // above[x+1] - above[x]
1648 // final pixels will be calculated as:
1649 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1650 __m256i a0, a1, a32, a16, c3f;
1651 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1652
1653 a16 = _mm256_set1_epi16(16);
1654 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1655 max_base_x256 = _mm256_set1_epi16(max_base_x);
1656 c3f = _mm256_set1_epi16(0x3f);
1657
1658 int x = dx;
1659 for (int r = 0; r < N; r++) {
1660 __m256i b, res;
1661
1662 int base = x >> frac_bits;
1663 if (base >= max_base_x) {
1664 for (int i = r; i < N; ++i) {
1665 dstvec[i] = a_mbase_x; // save 32 values
1666 dstvec[i + N] = a_mbase_x;
1667 }
1668 return;
1669 }
1670
1671 __m256i shift =
1672 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1673
1674 for (int j = 0; j < 32; j += 16) {
1675 int mdif = max_base_x - (base + j);
1676 if (mdif <= 0) {
1677 res = a_mbase_x;
1678 } else {
1679 a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1680 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1681
1682 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1683 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1684 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1685 b = _mm256_mullo_epi16(diff, shift);
1686
1687 res = _mm256_add_epi16(a32, b);
1688 res = _mm256_srli_epi16(res, 5);
1689
1690 base_inc256 = _mm256_setr_epi16(
1691 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1692 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1693 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1694 base + j + 13, base + j + 14, base + j + 15);
1695
1696 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1697 res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1698 }
1699 if (!j) {
1700 dstvec[r] = res;
1701 } else {
1702 dstvec[r + N] = res;
1703 }
1704 }
1705 x += dx;
1706 }
1707 }
1708
1709 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
1710 ptrdiff_t stride,
1711 const uint16_t *above,
1712 int upsample_above, int dx,
1713 int bd) {
1714 __m256i dstvec[128];
1715 if (bd < 12) {
1716 highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
1717 dx);
1718 } else {
1719 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
1720 upsample_above, dx);
1721 }
1722 for (int i = 0; i < N; i++) {
1723 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
1724 _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
1725 }
1726 }
1727
1728 static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
1729 ptrdiff_t stride,
1730 const uint16_t *above,
1731 int upsample_above,
1732 int dx) {
1733 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1734 (void)upsample_above;
1735 const int frac_bits = 6;
1736 const int max_base_x = ((64 + N) - 1);
1737
1738 // pre-filter above pixels
1739 // store in temp buffers:
1740 // above[x] * 32 + 16
1741 // above[x+1] - above[x]
1742 // final pixels will be calculated as:
1743 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1744 __m256i a0, a0_1, a1, a1_1, a32, a16;
1745 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1746
1747 a16 = _mm256_set1_epi32(16);
1748 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1749 max_base_x256 = _mm256_set1_epi16(max_base_x);
1750
1751 int x = dx;
1752 for (int r = 0; r < N; r++, dst += stride) {
1753 __m256i b, res[2], res1;
1754
1755 int base = x >> frac_bits;
1756 if (base >= max_base_x) {
1757 for (int i = r; i < N; ++i) {
1758 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
1759 _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1760 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1761 _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1762 dst += stride;
1763 }
1764 return;
1765 }
1766
1767 __m256i shift = _mm256_srli_epi32(
1768 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
1769
1770 __m128i a0_128, a0_1_128, a1_128, a1_1_128;
1771 for (int j = 0; j < 64; j += 16) {
1772 int mdif = max_base_x - (base + j);
1773 if (mdif <= 0) {
1774 _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1775 } else {
1776 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
1777 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
1778 a0 = _mm256_cvtepu16_epi32(a0_128);
1779 a1 = _mm256_cvtepu16_epi32(a1_128);
1780
1781 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
1782 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
1783 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1784 b = _mm256_mullo_epi32(diff, shift);
1785
1786 res[0] = _mm256_add_epi32(a32, b);
1787 res[0] = _mm256_srli_epi32(res[0], 5);
1788 res[0] = _mm256_packus_epi32(
1789 res[0],
1790 _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
1791 if (mdif > 8) {
1792 a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
1793 a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
1794 a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
1795 a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
1796
1797 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
1798 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
1799 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
1800 b = _mm256_mullo_epi32(diff, shift);
1801
1802 res[1] = _mm256_add_epi32(a32, b);
1803 res[1] = _mm256_srli_epi32(res[1], 5);
1804 res[1] = _mm256_packus_epi32(
1805 res[1],
1806 _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
1807 } else {
1808 res[1] = a_mbase_x;
1809 }
1810 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
1811                                        1);  // 16 16-bit values
1812 base_inc256 = _mm256_setr_epi16(
1813 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1814 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1815 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1816 base + j + 13, base + j + 14, base + j + 15);
1817
1818 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1819 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
1820 _mm256_storeu_si256((__m256i *)(dst + j), res1);
1821 }
1822 }
1823 x += dx;
1824 }
1825 }
1826
1827 static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
1828 ptrdiff_t stride,
1829 const uint16_t *above,
1830 int upsample_above, int dx) {
1831 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
1832 (void)upsample_above;
1833 const int frac_bits = 6;
1834 const int max_base_x = ((64 + N) - 1);
1835
1836 // pre-filter above pixels
1837 // store in temp buffers:
1838 // above[x] * 32 + 16
1839 // above[x+1] - above[x]
1840 // final pixels will be calculated as:
1841 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1842 __m256i a0, a1, a32, a16, c3f;
1843 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
1844
1845 a16 = _mm256_set1_epi16(16);
1846 a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
1847 max_base_x256 = _mm256_set1_epi16(max_base_x);
1848 c3f = _mm256_set1_epi16(0x3f);
1849
1850 int x = dx;
1851 for (int r = 0; r < N; r++, dst += stride) {
1852 __m256i b, res;
1853
1854 int base = x >> frac_bits;
1855 if (base >= max_base_x) {
1856 for (int i = r; i < N; ++i) {
1857 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
1858 _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
1859 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
1860 _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
1861 dst += stride;
1862 }
1863 return;
1864 }
1865
1866 __m256i shift =
1867 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
1868
1869 for (int j = 0; j < 64; j += 16) {
1870 int mdif = max_base_x - (base + j);
1871 if (mdif <= 0) {
1872 _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
1873 } else {
1874 a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
1875 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
1876
1877 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
1878 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
1879 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
1880 b = _mm256_mullo_epi16(diff, shift);
1881
1882 res = _mm256_add_epi16(a32, b);
1883 res = _mm256_srli_epi16(res, 5);
1884
1885 base_inc256 = _mm256_setr_epi16(
1886 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
1887 base + j + 5, base + j + 6, base + j + 7, base + j + 8,
1888 base + j + 9, base + j + 10, base + j + 11, base + j + 12,
1889 base + j + 13, base + j + 14, base + j + 15);
1890
1891 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
1892 res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
1893 _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values
1894 }
1895 }
1896 x += dx;
1897 }
1898 }
1899
1900 // Directional prediction, zone 1: 0 < angle < 90
1901 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
1902 int bh, const uint16_t *above,
1903 const uint16_t *left, int upsample_above,
1904 int dx, int dy, int bd) {
1905 (void)left;
1906 (void)dy;
1907
1908 switch (bw) {
1909 case 4:
1910 highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
1911 dx, bd);
1912 break;
1913 case 8:
1914 highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
1915 dx, bd);
1916 break;
1917 case 16:
1918 highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
1919 dx, bd);
1920 break;
1921 case 32:
1922 highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
1923 dx, bd);
1924 break;
1925 case 64:
1926 if (bd < 12) {
1927 highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
1928 upsample_above, dx);
1929 } else {
1930 highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
1931 upsample_above, dx);
1932 }
1933 break;
1934 default: break;
1935 }
1936 return;
1937 }
1938
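// Illustrative call, not taken from the library (in libaom this entry point
// is normally reached through the av1_highbd_dr_prediction_z1() RTCD
// dispatch rather than called directly).  left and dy are ignored for
// zone 1, and dx == 64 advances the projection by exactly one pixel per row,
// i.e. a 45-degree direction:
//
//   av1_highbd_dr_prediction_z1_avx2(dst, stride, /*bw=*/16, /*bh=*/16,
//                                    above, left, /*upsample_above=*/0,
//                                    /*dx=*/64, /*dy=*/0, /*bd=*/10);
//
// Indices 0 .. bw + bh - 1 of above[] are referenced by the prediction; the
// caller's edge buffer is expected to be padded beyond that for the 16-wide
// vector loads.
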
1939 static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
1940 uint16_t *dst, ptrdiff_t pitchDst) {
1941 __m256i r[16];
1942 __m256i d[16];
1943 for (int j = 0; j < 16; j++) {
1944 r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1945 }
1946 highbd_transpose16x16_avx2(r, d);
1947 for (int j = 0; j < 16; j++) {
1948 _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
1949 }
1950 }
1951
1952 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
1953 uint16_t *dst, ptrdiff_t pitchDst, int width,
1954 int height) {
1955 for (int j = 0; j < height; j += 16)
1956 for (int i = 0; i < width; i += 16)
1957 highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
1958 dst + j * pitchDst + i, pitchDst);
1959 }
1960
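// Scalar equivalent of highbd_transpose() above, for reference.  Width and
// height are assumed to be multiples of 16, which the 16x16 tiling requires:
//
//   for (int r = 0; r < height; ++r)
//     for (int c = 0; c < width; ++c)
//       dst[r * pitchDst + c] = src[c * pitchSrc + r];
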
1961 static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
1962 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
1963 const uint16_t *left, int upsample_above, int upsample_left, int dx,
1964 int dy) {
1965 const int min_base_x = -(1 << upsample_above);
1966 const int min_base_y = -(1 << upsample_left);
1967 const int frac_bits_x = 6 - upsample_above;
1968 const int frac_bits_y = 6 - upsample_left;
1969
1970 assert(dx > 0);
1971 // pre-filter above pixels
1972 // store in temp buffers:
1973 // above[x] * 32 + 16
1974 // above[x+1] - above[x]
1975 // final pixels will be calculated as:
1976 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
1977 __m256i a0_x, a1_x, a32, a16;
1978 __m256i diff;
1979 __m128i c3f, min_base_y128;
1980
1981 a16 = _mm256_set1_epi32(16);
1982 c3f = _mm_set1_epi32(0x3f);
1983 min_base_y128 = _mm_set1_epi32(min_base_y);
1984
1985 for (int r = 0; r < N; r++) {
1986 __m256i b, res, shift;
1987 __m128i resx, resy, resxy;
1988 __m128i a0_x128, a1_x128;
1989 int y = r + 1;
1990 int base_x = (-y * dx) >> frac_bits_x;
1991 int base_shift = 0;
1992 if (base_x < (min_base_x - 1)) {
1993 base_shift = (min_base_x - base_x - 1) >> upsample_above;
1994 }
1995 int base_min_diff =
1996 (min_base_x - base_x + upsample_above) >> upsample_above;
1997 if (base_min_diff > 4) {
1998 base_min_diff = 4;
1999 } else {
2000 if (base_min_diff < 0) base_min_diff = 0;
2001 }
2002
2003 if (base_shift > 3) {
2004 a0_x = _mm256_setzero_si256();
2005 a1_x = _mm256_setzero_si256();
2006 shift = _mm256_setzero_si256();
2007 } else {
2008 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2009 if (upsample_above) {
2010 a0_x128 = _mm_shuffle_epi8(a0_x128,
2011 *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2012 a1_x128 = _mm_srli_si128(a0_x128, 8);
2013
2014 shift = _mm256_castsi128_si256(_mm_srli_epi32(
2015 _mm_and_si128(
2016 _mm_slli_epi32(
2017 _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2018 (2 << 6) - y * dx, (3 << 6) - y * dx),
2019 upsample_above),
2020 c3f),
2021 1));
2022 } else {
2023 a0_x128 =
2024 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2025 a1_x128 = _mm_srli_si128(a0_x128, 2);
2026
2027 shift = _mm256_castsi128_si256(_mm_srli_epi32(
2028 _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
2029 (2 << 6) - y * dx, (3 << 6) - y * dx),
2030 c3f),
2031 1));
2032 }
2033 a0_x = _mm256_cvtepu16_epi32(a0_x128);
2034 a1_x = _mm256_cvtepu16_epi32(a1_x128);
2035 }
2036 // y calc
2037 __m128i a0_y, a1_y, shifty;
2038 if (base_x < min_base_x) {
2039 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2040 DECLARE_ALIGNED(32, int, base_y_c[4]);
2041 r6 = _mm_set1_epi32(r << 6);
2042 dy128 = _mm_set1_epi32(dy);
2043 c1234 = _mm_setr_epi32(1, 2, 3, 4);
2044 y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
2045 base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
2046 mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
2047 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2048 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2049
2050 a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
2051 left[base_y_c[2]], left[base_y_c[3]]);
2052 a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2053 left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
2054
2055 if (upsample_left) {
2056 shifty = _mm_srli_epi32(
2057 _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
2058 } else {
2059 shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
2060 }
2061 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2062 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2063 shift = _mm256_inserti128_si256(shift, shifty, 1);
2064 }
2065
2066 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
2067 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
2068 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2069
2070 b = _mm256_mullo_epi32(diff, shift);
2071 res = _mm256_add_epi32(a32, b);
2072 res = _mm256_srli_epi32(res, 5);
2073
2074 resx = _mm256_castsi256_si128(res);
2075 resx = _mm_packus_epi32(resx, resx);
2076
2077 resy = _mm256_extracti128_si256(res, 1);
2078 resy = _mm_packus_epi32(resy, resy);
2079
2080 resxy =
2081 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2082 _mm_storel_epi64((__m128i *)(dst), resxy);
2083 dst += stride;
2084 }
2085 }
2086
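// A minimal scalar model of the zone-2 blend that these kernels vectorize,
// assuming no edge upsampling (upsample_above == upsample_left == 0, so
// min_base_x == min_base_y == -1).  above[-1] and left[-1] are the top-left
// corner sample and must be valid.  Documentation only, outside the build:
#if 0
static void highbd_dr_z2_scalar_sketch(uint16_t *dst, ptrdiff_t stride,
                                       int bw, int bh, const uint16_t *above,
                                       const uint16_t *left, int dx, int dy) {
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      // Project the pixel onto the above row; a projection left of -1 means
      // the ray leaves through the left side of the block instead.
      const int x = (c << 6) - (r + 1) * dx;
      const int base_x = x >> 6;
      if (base_x >= -1) {
        const int shift = (x & 0x3f) >> 1;
        dst[r * stride + c] =
            (uint16_t)((above[base_x] * 32 + 16 +
                        (above[base_x + 1] - above[base_x]) * shift) >>
                       5);
      } else {
        // Use the left column: project onto it with dy.
        const int y = (r << 6) - (c + 1) * dy;
        const int base_y = y >> 6;
        const int shift = (y & 0x3f) >> 1;
        dst[r * stride + c] =
            (uint16_t)((left[base_y] * 32 + 16 +
                        (left[base_y + 1] - left[base_y]) * shift) >>
                       5);
      }
    }
  }
}
#endif
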
2087 static void highbd_dr_prediction_z2_Nx4_avx2(
2088 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2089 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2090 int dy) {
2091 const int min_base_x = -(1 << upsample_above);
2092 const int min_base_y = -(1 << upsample_left);
2093 const int frac_bits_x = 6 - upsample_above;
2094 const int frac_bits_y = 6 - upsample_left;
2095
2096 assert(dx > 0);
2097 // pre-filter above pixels
2098 // store in temp buffers:
2099 // above[x] * 32 + 16
2100 // above[x+1] - above[x]
2101 // final pixels will be calculated as:
2102 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2103 __m256i a0_x, a1_x, a32, a16;
2104 __m256i diff;
2105 __m128i c3f, min_base_y128;
2106
2107 a16 = _mm256_set1_epi16(16);
2108 c3f = _mm_set1_epi16(0x3f);
2109 min_base_y128 = _mm_set1_epi16(min_base_y);
2110
2111 for (int r = 0; r < N; r++) {
2112 __m256i b, res, shift;
2113 __m128i resx, resy, resxy;
2114 __m128i a0_x128, a1_x128;
2115 int y = r + 1;
2116 int base_x = (-y * dx) >> frac_bits_x;
2117 int base_shift = 0;
2118 if (base_x < (min_base_x - 1)) {
2119 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2120 }
2121 int base_min_diff =
2122 (min_base_x - base_x + upsample_above) >> upsample_above;
2123 if (base_min_diff > 4) {
2124 base_min_diff = 4;
2125 } else {
2126 if (base_min_diff < 0) base_min_diff = 0;
2127 }
2128
2129 if (base_shift > 3) {
2130 a0_x = _mm256_setzero_si256();
2131 a1_x = _mm256_setzero_si256();
2132 shift = _mm256_setzero_si256();
2133 } else {
2134 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2135 if (upsample_above) {
2136 a0_x128 = _mm_shuffle_epi8(a0_x128,
2137 *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
2138 a1_x128 = _mm_srli_si128(a0_x128, 8);
2139
2140 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2141 _mm_and_si128(
2142 _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2143 (2 << 6) - y * dx,
2144 (3 << 6) - y * dx, 0, 0, 0, 0),
2145 upsample_above),
2146 c3f),
2147 1));
2148 } else {
2149 a0_x128 =
2150 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2151 a1_x128 = _mm_srli_si128(a0_x128, 2);
2152
2153 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2154 _mm_and_si128(
2155 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2156 (3 << 6) - y * dx, 0, 0, 0, 0),
2157 c3f),
2158 1));
2159 }
2160 a0_x = _mm256_castsi128_si256(a0_x128);
2161 a1_x = _mm256_castsi128_si256(a1_x128);
2162 }
2163 // y calc
2164 __m128i a0_y, a1_y, shifty;
2165 if (base_x < min_base_x) {
2166 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2167 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2168 r6 = _mm_set1_epi16(r << 6);
2169 dy128 = _mm_set1_epi16(dy);
2170 c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
2171 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2172 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2173 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2174 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2175 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2176
2177 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2178 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
2179 a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2180 left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
2181 0, 0);
2182
2183 if (upsample_left) {
2184 shifty = _mm_srli_epi16(
2185 _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
2186 } else {
2187 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2188 }
2189 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2190 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2191 shift = _mm256_inserti128_si256(shift, shifty, 1);
2192 }
2193
2194 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2195 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2196 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2197
2198 b = _mm256_mullo_epi16(diff, shift);
2199 res = _mm256_add_epi16(a32, b);
2200 res = _mm256_srli_epi16(res, 5);
2201
2202 resx = _mm256_castsi256_si128(res);
2203 resy = _mm256_extracti128_si256(res, 1);
2204 resxy =
2205 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2206 _mm_storel_epi64((__m128i *)(dst), resxy);
2207 dst += stride;
2208 }
2209 }
2210
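// How base_shift / HighbdLoadMaskx handle columns that fall off the left end
// of the above row (hand-worked, no upsampling, so min_base_x == -1):
// suppose a row projects to base_x = -4.  Then base_shift = 2 and
// base_min_diff = 3.  The vector load starts at above[base_x + base_shift],
// i.e. above[-2] (the caller's edge buffer is expected to be padded so this
// stays in bounds), and HighbdLoadMaskx[2] realigns it so that lane c holds
// above[base_x + c] for c >= 2 while lanes 0..1 simply repeat the first
// loaded word.  The low lanes do not affect the output: HighbdBaseMask[3]
// later blends lanes 0..2 with the values interpolated from the left column.
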
2211 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
2212 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2213 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2214 int dy) {
2215 const int min_base_x = -(1 << upsample_above);
2216 const int min_base_y = -(1 << upsample_left);
2217 const int frac_bits_x = 6 - upsample_above;
2218 const int frac_bits_y = 6 - upsample_left;
2219
2220 // pre-filter above pixels
2221 // store in temp buffers:
2222 // above[x] * 32 + 16
2223 // above[x+1] - above[x]
2224 // final pixels will be calculated as:
2225 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2226 __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
2227 __m256i diff;
2228 __m128i a0_x128, a1_x128;
2229
2230 a16 = _mm256_set1_epi32(16);
2231 c3f = _mm256_set1_epi32(0x3f);
2232 min_base_y256 = _mm256_set1_epi32(min_base_y);
2233
2234 for (int r = 0; r < N; r++) {
2235 __m256i b, res, shift;
2236 __m128i resx, resy, resxy;
2237 int y = r + 1;
2238 int base_x = (-y * dx) >> frac_bits_x;
2239 int base_shift = 0;
2240 if (base_x < (min_base_x - 1)) {
2241 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2242 }
2243 int base_min_diff =
2244 (min_base_x - base_x + upsample_above) >> upsample_above;
2245 if (base_min_diff > 8) {
2246 base_min_diff = 8;
2247 } else {
2248 if (base_min_diff < 0) base_min_diff = 0;
2249 }
2250
2251 if (base_shift > 7) {
2252 resx = _mm_setzero_si128();
2253 } else {
2254 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2255 if (upsample_above) {
2256 __m128i mask, atmp0, atmp1, atmp2, atmp3;
2257 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2258 atmp0 = _mm_shuffle_epi8(a0_x128,
2259 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2260 atmp1 = _mm_shuffle_epi8(a1_x128,
2261 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2262 atmp2 = _mm_shuffle_epi8(
2263 a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2264 atmp3 = _mm_shuffle_epi8(
2265 a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2266 mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2267 _mm_set1_epi8(15));
2268 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2269 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2270 _mm_set1_epi8(15));
2271 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2272 shift = _mm256_srli_epi32(
2273 _mm256_and_si256(
2274 _mm256_slli_epi32(
2275 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
2276 (2 << 6) - y * dx, (3 << 6) - y * dx,
2277 (4 << 6) - y * dx, (5 << 6) - y * dx,
2278 (6 << 6) - y * dx, (7 << 6) - y * dx),
2279 upsample_above),
2280 c3f),
2281 1);
2282 } else {
2283 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2284 a0_x128 =
2285 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2286 a1_x128 =
2287 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2288
2289 shift = _mm256_srli_epi32(
2290 _mm256_and_si256(
2291 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
2292 (3 << 6) - y * dx, (4 << 6) - y * dx,
2293 (5 << 6) - y * dx, (6 << 6) - y * dx,
2294 (7 << 6) - y * dx),
2295 c3f),
2296 1);
2297 }
2298 a0_x = _mm256_cvtepu16_epi32(a0_x128);
2299 a1_x = _mm256_cvtepu16_epi32(a1_x128);
2300
2301 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
2302 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
2303 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2304
2305 b = _mm256_mullo_epi32(diff, shift);
2306 res = _mm256_add_epi32(a32, b);
2307 res = _mm256_srli_epi32(res, 5);
2308
2309 resx = _mm256_castsi256_si128(_mm256_packus_epi32(
2310 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2311 }
2312 // y calc
2313 if (base_x < min_base_x) {
2314 DECLARE_ALIGNED(32, int, base_y_c[8]);
2315 __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
2316 r6 = _mm256_set1_epi32(r << 6);
2317 dy256 = _mm256_set1_epi32(dy);
2318 c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
2319 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2320 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2321 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2322 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2323 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2324
2325 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2326 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2327 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2328 left[base_y_c[6]], left[base_y_c[7]]));
2329 a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2330 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2331 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2332 left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2333
2334 if (upsample_left) {
2335 shift = _mm256_srli_epi32(
2336 _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
2337 1);
2338 } else {
2339 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2340 }
2341 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
2342 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
2343 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2344
2345 b = _mm256_mullo_epi32(diff, shift);
2346 res = _mm256_add_epi32(a32, b);
2347 res = _mm256_srli_epi32(res, 5);
2348
2349 resy = _mm256_castsi256_si128(_mm256_packus_epi32(
2350 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
2351 } else {
2352 resy = resx;
2353 }
2354 resxy =
2355 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2356 _mm_storeu_si128((__m128i *)(dst), resxy);
2357 dst += stride;
2358 }
2359 }
2360
2361 static void highbd_dr_prediction_z2_Nx8_avx2(
2362 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2363 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2364 int dy) {
2365 const int min_base_x = -(1 << upsample_above);
2366 const int min_base_y = -(1 << upsample_left);
2367 const int frac_bits_x = 6 - upsample_above;
2368 const int frac_bits_y = 6 - upsample_left;
2369
2370 // pre-filter above pixels
2371 // store in temp buffers:
2372 // above[x] * 32 + 16
2373 // above[x+1] - above[x]
2374 // final pixels will be calculated as:
2375 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2376 __m128i c3f, min_base_y128;
2377 __m256i a0_x, a1_x, diff, a32, a16;
2378 __m128i a0_x128, a1_x128;
2379
2380 a16 = _mm256_set1_epi16(16);
2381 c3f = _mm_set1_epi16(0x3f);
2382 min_base_y128 = _mm_set1_epi16(min_base_y);
2383
2384 for (int r = 0; r < N; r++) {
2385 __m256i b, res, shift;
2386 __m128i resx, resy, resxy;
2387 int y = r + 1;
2388 int base_x = (-y * dx) >> frac_bits_x;
2389 int base_shift = 0;
2390 if (base_x < (min_base_x - 1)) {
2391 base_shift = (min_base_x - base_x - 1) >> upsample_above;
2392 }
2393 int base_min_diff =
2394 (min_base_x - base_x + upsample_above) >> upsample_above;
2395 if (base_min_diff > 8) {
2396 base_min_diff = 8;
2397 } else {
2398 if (base_min_diff < 0) base_min_diff = 0;
2399 }
2400
2401 if (base_shift > 7) {
2402 a0_x = _mm256_setzero_si256();
2403 a1_x = _mm256_setzero_si256();
2404 shift = _mm256_setzero_si256();
2405 } else {
2406 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2407 if (upsample_above) {
2408 __m128i mask, atmp0, atmp1, atmp2, atmp3;
2409 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
2410 atmp0 = _mm_shuffle_epi8(a0_x128,
2411 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2412 atmp1 = _mm_shuffle_epi8(a1_x128,
2413 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
2414 atmp2 = _mm_shuffle_epi8(
2415 a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2416 atmp3 = _mm_shuffle_epi8(
2417 a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
2418 mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
2419 _mm_set1_epi8(15));
2420 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
2421 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
2422 _mm_set1_epi8(15));
2423 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
2424
2425 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2426 _mm_and_si128(
2427 _mm_slli_epi16(
2428 _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2429 (2 << 6) - y * dx, (3 << 6) - y * dx,
2430 (4 << 6) - y * dx, (5 << 6) - y * dx,
2431 (6 << 6) - y * dx, (7 << 6) - y * dx),
2432 upsample_above),
2433 c3f),
2434 1));
2435 } else {
2436 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
2437 a0_x128 =
2438 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2439 a1_x128 =
2440 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2441
2442 shift = _mm256_castsi128_si256(_mm_srli_epi16(
2443 _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
2444 (2 << 6) - y * dx, (3 << 6) - y * dx,
2445 (4 << 6) - y * dx, (5 << 6) - y * dx,
2446 (6 << 6) - y * dx, (7 << 6) - y * dx),
2447 c3f),
2448 1));
2449 }
2450 a0_x = _mm256_castsi128_si256(a0_x128);
2451 a1_x = _mm256_castsi128_si256(a1_x128);
2452 }
2453
2454 // y calc
2455 __m128i a0_y, a1_y, shifty;
2456 if (base_x < min_base_x) {
2457 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
2458 __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
2459 r6 = _mm_set1_epi16(r << 6);
2460 dy128 = _mm_set1_epi16(dy);
2461 c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
2462 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
2463 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
2464 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
2465 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
2466 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
2467
2468 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
2469 left[base_y_c[2]], left[base_y_c[3]],
2470 left[base_y_c[4]], left[base_y_c[5]],
2471 left[base_y_c[6]], left[base_y_c[7]]);
2472 a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
2473 left[base_y_c[2] + 1], left[base_y_c[3] + 1],
2474 left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2475 left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
2476
2477 if (upsample_left) {
2478 shifty = _mm_srli_epi16(
2479 _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
2480 } else {
2481 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
2482 }
2483 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
2484 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
2485 shift = _mm256_inserti128_si256(shift, shifty, 1);
2486 }
2487
2488 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2489 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2490 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2491
2492 b = _mm256_mullo_epi16(diff, shift);
2493 res = _mm256_add_epi16(a32, b);
2494 res = _mm256_srli_epi16(res, 5);
2495
2496 resx = _mm256_castsi256_si128(res);
2497 resy = _mm256_extracti128_si256(res, 1);
2498
2499 resxy =
2500 _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
2501 _mm_storeu_si128((__m128i *)(dst), resxy);
2502 dst += stride;
2503 }
2504 }
2505
2506 static void highbd_dr_prediction_32bit_z2_HxW_avx2(
2507 int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2508 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2509 int dy) {
2510 // here upsample_above and upsample_left are 0 by design of
2511 // av1_use_intra_edge_upsample
2512 const int min_base_x = -1;
2513 const int min_base_y = -1;
2514 (void)upsample_above;
2515 (void)upsample_left;
2516 const int frac_bits_x = 6;
2517 const int frac_bits_y = 6;
2518
2519 // pre-filter above pixels
2520 // store in temp buffers:
2521 // above[x] * 32 + 16
2522 // above[x+1] - above[x]
2523 // final pixels will be calculated as:
2524 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2525 __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
2526 __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
2527 __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2528 DECLARE_ALIGNED(32, int, base_y_c[16]);
2529
2530 a16 = _mm256_set1_epi32(16);
2531 c1 = _mm256_srli_epi32(a16, 4);
2532 c8 = _mm256_srli_epi32(a16, 1);
2533 min_base_y256 = _mm256_set1_epi32(min_base_y);
2534 c3f = _mm256_set1_epi32(0x3f);
2535 dy256 = _mm256_set1_epi32(dy);
2536 c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2537 c1234 = _mm256_add_epi32(c0123, c1);
2538
2539 for (int r = 0; r < H; r++) {
2540 __m256i b, res, shift, ydx;
2541 __m256i resx[2], resy[2];
2542 __m256i resxy, j256, r6;
2543 for (int j = 0; j < W; j += 16) {
2544 j256 = _mm256_set1_epi32(j);
2545 int y = r + 1;
2546 ydx = _mm256_set1_epi32(y * dx);
2547
2548 int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2549 int base_shift = 0;
2550      if (base_x < (min_base_x - 1)) {
2551 base_shift = (min_base_x - base_x - 1);
2552 }
2553 int base_min_diff = (min_base_x - base_x);
2554 if (base_min_diff > 16) {
2555 base_min_diff = 16;
2556 } else {
2557 if (base_min_diff < 0) base_min_diff = 0;
2558 }
2559
2560 if (base_shift > 7) {
2561 resx[0] = _mm256_setzero_si256();
2562 } else {
2563 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2564 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2565 a0_x128 =
2566 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2567 a1_x128 =
2568 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2569
2570 a0_x = _mm256_cvtepu16_epi32(a0_x128);
2571 a1_x = _mm256_cvtepu16_epi32(a1_x128);
2572
2573 r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
2574 shift = _mm256_srli_epi32(
2575 _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2576
2577 diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
2578 a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
2579 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2580
2581 b = _mm256_mullo_epi32(diff, shift);
2582 res = _mm256_add_epi32(a32, b);
2583 res = _mm256_srli_epi32(res, 5);
2584
2585 resx[0] = _mm256_packus_epi32(
2586 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2587 }
2588 int base_shift8 = 0;
2589 if ((base_x + 8) < (min_base_x - 1)) {
2590 base_shift8 = (min_base_x - (base_x + 8) - 1);
2591 }
2592 if (base_shift8 > 7) {
2593 resx[1] = _mm256_setzero_si256();
2594 } else {
2595 a0_1_x128 =
2596 _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
2597 a1_1_x128 =
2598 _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
2599 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2600 *(__m128i *)HighbdLoadMaskx[base_shift8]);
2601 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2602 *(__m128i *)HighbdLoadMaskx[base_shift8]);
2603
2604 a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
2605 a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
2606
2607 r6 = _mm256_slli_epi32(
2608 _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
2609 shift = _mm256_srli_epi32(
2610 _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
2611
2612 diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
2613 a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
2614 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2615 b = _mm256_mullo_epi32(diff, shift);
2616
2617 resx[1] = _mm256_add_epi32(a32, b);
2618 resx[1] = _mm256_srli_epi32(resx[1], 5);
2619 resx[1] = _mm256_packus_epi32(
2620 resx[1],
2621 _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
2622 }
2623 resx[0] =
2624 _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
2625                                  1);  // 16 16-bit values
2626
2627 // y calc
2628 resy[0] = _mm256_setzero_si256();
2629      if (base_x < min_base_x) {
2630 __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
2631 r6 = _mm256_set1_epi32(r << 6);
2632 c256 = _mm256_add_epi32(j256, c1234);
2633 y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2634 base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
2635 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2636 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2637 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2638 c256 = _mm256_add_epi32(c256, c8);
2639 y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
2640 base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
2641 mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
2642 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2643 _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
2644
2645 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2646 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2647 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2648 left[base_y_c[6]], left[base_y_c[7]]));
2649 a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2650 left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
2651 left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
2652 left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
2653
2654 shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
2655
2656 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
2657 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
2658 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2659
2660 b = _mm256_mullo_epi32(diff, shift);
2661 res = _mm256_add_epi32(a32, b);
2662 res = _mm256_srli_epi32(res, 5);
2663
2664 resy[0] = _mm256_packus_epi32(
2665 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2666
2667 a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
2668 left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
2669 left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
2670 left[base_y_c[14]], left[base_y_c[15]]));
2671 a1_y = _mm256_cvtepu16_epi32(
2672 _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
2673 left[base_y_c[10] + 1], left[base_y_c[11] + 1],
2674 left[base_y_c[12] + 1], left[base_y_c[13] + 1],
2675 left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
2676 shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
2677
2678 diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
2679 a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
2680 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
2681
2682 b = _mm256_mullo_epi32(diff, shift);
2683 res = _mm256_add_epi32(a32, b);
2684 res = _mm256_srli_epi32(res, 5);
2685
2686 resy[1] = _mm256_packus_epi32(
2687 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
2688
2689 resy[0] =
2690 _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
2691                                    1);  // 16 16-bit values
2692 }
2693
2694 resxy = _mm256_blendv_epi8(resx[0], resy[0],
2695 *(__m256i *)HighbdBaseMask[base_min_diff]);
2696 _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2697 } // for j
2698 dst += stride;
2699 }
2700 }
2701
2702 static void highbd_dr_prediction_z2_HxW_avx2(
2703 int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
2704 const uint16_t *left, int upsample_above, int upsample_left, int dx,
2705 int dy) {
2706 // here upsample_above and upsample_left are 0 by design of
2707 // av1_use_intra_edge_upsample
2708 const int min_base_x = -1;
2709 const int min_base_y = -1;
2710 (void)upsample_above;
2711 (void)upsample_left;
2712 const int frac_bits_x = 6;
2713 const int frac_bits_y = 6;
2714
2715 // pre-filter above pixels
2716 // store in temp buffers:
2717 // above[x] * 32 + 16
2718 // above[x+1] - above[x]
2719 // final pixels will be calculated as:
2720 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
2721 __m256i a0_x, a1_x, a32, a16, c3f, c1;
2722 __m256i diff, min_base_y256, dy256, c1234, c0123;
2723 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
2724
2725 a16 = _mm256_set1_epi16(16);
2726 c1 = _mm256_srli_epi16(a16, 4);
2727 min_base_y256 = _mm256_set1_epi16(min_base_y);
2728 c3f = _mm256_set1_epi16(0x3f);
2729 dy256 = _mm256_set1_epi16(dy);
2730 c0123 =
2731 _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
2732 c1234 = _mm256_add_epi16(c0123, c1);
2733
2734 for (int r = 0; r < H; r++) {
2735 __m256i b, res, shift;
2736 __m256i resx, resy, ydx;
2737 __m256i resxy, j256, r6;
2738 __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
2739 int y = r + 1;
2740 ydx = _mm256_set1_epi16((short)(y * dx));
2741
2742 for (int j = 0; j < W; j += 16) {
2743 j256 = _mm256_set1_epi16(j);
2744 int base_x = ((j << 6) - y * dx) >> frac_bits_x;
2745 int base_shift = 0;
2746      if (base_x < (min_base_x - 1)) {
2747        base_shift = (min_base_x - base_x - 1);
2748 }
2749 int base_min_diff = (min_base_x - base_x);
2750 if (base_min_diff > 16) {
2751 base_min_diff = 16;
2752 } else {
2753 if (base_min_diff < 0) base_min_diff = 0;
2754 }
2755
2756 if (base_shift < 8) {
2757 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
2758 a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
2759 a0_x128 =
2760 _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2761 a1_x128 =
2762 _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
2763
2764 a0_x = _mm256_castsi128_si256(a0_x128);
2765 a1_x = _mm256_castsi128_si256(a1_x128);
2766 } else {
2767 a0_x = _mm256_setzero_si256();
2768 a1_x = _mm256_setzero_si256();
2769 }
2770
2771 int base_shift1 = 0;
2772 if (base_shift > 8) {
2773 base_shift1 = base_shift - 8;
2774 }
2775 if (base_shift1 < 8) {
2776 a0_1_x128 =
2777 _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
2778 a1_1_x128 =
2779 _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
2780 a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
2781 *(__m128i *)HighbdLoadMaskx[base_shift1]);
2782 a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
2783 *(__m128i *)HighbdLoadMaskx[base_shift1]);
2784
2785 a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
2786 a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
2787 }
2788 r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
2789 shift = _mm256_srli_epi16(
2790 _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
2791
2792 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
2793 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
2794 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2795
2796 b = _mm256_mullo_epi16(diff, shift);
2797 res = _mm256_add_epi16(a32, b);
2798 resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
2799
2800 // y calc
2801 resy = _mm256_setzero_si256();
2802 __m256i a0_y, a1_y, shifty;
2803      if (base_x < min_base_x) {
2804 __m256i c256, y_c256, base_y_c256, mask256, mul16;
2805 r6 = _mm256_set1_epi16(r << 6);
2806 c256 = _mm256_add_epi16(j256, c1234);
2807 mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
2808 _mm256_srli_epi16(min_base_y256, 1));
2809 y_c256 = _mm256_sub_epi16(r6, mul16);
2810 base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
2811 mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
2812 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
2813 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2814
2815 a0_y = _mm256_setr_epi16(
2816 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2817 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2818 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2819 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2820 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2821 left[base_y_c[15]]);
2822 base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
2823 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
2824
2825 a1_y = _mm256_setr_epi16(
2826 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
2827 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
2828 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
2829 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
2830 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
2831 left[base_y_c[15]]);
2832
2833 shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
2834
2835 diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
2836 a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
2837 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
2838
2839 b = _mm256_mullo_epi16(diff, shifty);
2840 res = _mm256_add_epi16(a32, b);
2841 resy = _mm256_srli_epi16(res, 5);
2842 }
2843
2844 resxy = _mm256_blendv_epi8(resx, resy,
2845 *(__m256i *)HighbdBaseMask[base_min_diff]);
2846 _mm256_storeu_si256((__m256i *)(dst + j), resxy);
2847 } // for j
2848 dst += stride;
2849 }
2850 }
2851
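// The _mm256_min_epu16() in the 16-bit y-path above appears to guard the
// 16-bit multiply c256 * dy256 against signed misinterpretation: for wide
// blocks the product can exceed 0x7fff (e.g. c = 64 with dy = 1023 gives
// 65472) while still fitting in the unsigned low 16 bits.  Unclamped, the
// signed subtraction r6 - mul16 would read such a value as a small negative
// number and produce a bogus, seemingly valid y_c256.  Clamping the product
// to min_base_y256 >> 1 (0x7fff) keeps y_c256 strongly negative, so
// base_y_c256 falls below min_base_y and the lane is handled as out of
// range; such lanes take their final value from the above row in any case.
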
2852 // Directional prediction, zone 2: 90 < angle < 180
2853 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
2854 int bh, const uint16_t *above,
2855 const uint16_t *left, int upsample_above,
2856 int upsample_left, int dx, int dy,
2857 int bd) {
2858 (void)bd;
2859 assert(dx > 0);
2860 assert(dy > 0);
2861 switch (bw) {
2862 case 4:
2863 if (bd < 12) {
2864 highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
2865 upsample_above, upsample_left, dx, dy);
2866 } else {
2867 highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
2868 upsample_above, upsample_left,
2869 dx, dy);
2870 }
2871 break;
2872 case 8:
2873 if (bd < 12) {
2874 highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
2875 upsample_above, upsample_left, dx, dy);
2876 } else {
2877 highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
2878 upsample_above, upsample_left,
2879 dx, dy);
2880 }
2881 break;
2882 default:
2883 if (bd < 12) {
2884 highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2885 upsample_above, upsample_left, dx, dy);
2886 } else {
2887 highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
2888 upsample_above, upsample_left,
2889 dx, dy);
2890 }
2891 break;
2892 }
2893 }
2894
2895 // Directional prediction, zone 3 functions
2896 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
2897 const uint16_t *left,
2898 int upsample_left, int dy,
2899 int bd) {
2900 __m128i dstvec[4], d[4];
2901 if (bd < 12) {
2902 highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
2903 dy);
2904 } else {
2905 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
2906 upsample_left, dy);
2907 }
2908 highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
2909 &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
2910 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
2911 _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
2912 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
2913 _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
2914 return;
2915 }
2916
2917 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
2918 const uint16_t *left,
2919 int upsample_left, int dy,
2920 int bd) {
2921 __m128i dstvec[8], d[8];
2922 if (bd < 12) {
2923 highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
2924 dy);
2925 } else {
2926 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
2927 upsample_left, dy);
2928 }
2929 highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2930 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2931 &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2932 &d[7]);
2933 for (int i = 0; i < 8; i++) {
2934 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
2935 }
2936 }
2937
2938 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
2939 const uint16_t *left,
2940 int upsample_left, int dy,
2941 int bd) {
2942 __m128i dstvec[4], d[8];
2943 if (bd < 12) {
2944 highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
2945 dy);
2946 } else {
2947 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
2948 upsample_left, dy);
2949 }
2950
2951 highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2952 &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
2953 &d[7]);
2954 for (int i = 0; i < 8; i++) {
2955 _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
2956 }
2957 }
2958
2959 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
2960 const uint16_t *left,
2961 int upsample_left, int dy,
2962 int bd) {
2963 __m128i dstvec[8], d[4];
2964 if (bd < 12) {
2965 highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
2966 dy);
2967 } else {
2968 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
2969 upsample_left, dy);
2970 }
2971
2972 highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
2973 &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
2974 &d[0], &d[1], &d[2], &d[3]);
2975 _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
2976 _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
2977 _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
2978 _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
2979 }
2980
2981 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
2982 const uint16_t *left,
2983 int upsample_left, int dy,
2984 int bd) {
2985 __m256i dstvec[8], d[8];
2986 if (bd < 12) {
2987 highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
2988 dy);
2989 } else {
2990 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
2991 upsample_left, dy);
2992 }
2993 highbd_transpose8x16_16x8_avx2(dstvec, d);
2994 for (int i = 0; i < 8; i++) {
2995 _mm_storeu_si128((__m128i *)(dst + i * stride),
2996 _mm256_castsi256_si128(d[i]));
2997 }
2998 for (int i = 8; i < 16; i++) {
2999 _mm_storeu_si128((__m128i *)(dst + i * stride),
3000 _mm256_extracti128_si256(d[i - 8], 1));
3001 }
3002 }
3003
3004 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
3005 const uint16_t *left,
3006 int upsample_left, int dy,
3007 int bd) {
3008 __m128i dstvec[16], d[16];
3009 if (bd < 12) {
3010 highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
3011 dy);
3012 } else {
3013 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
3014 upsample_left, dy);
3015 }
3016 for (int i = 0; i < 16; i += 8) {
3017 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3018 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3019 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3020 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3021 &d[5 + i], &d[6 + i], &d[7 + i]);
3022 }
3023 for (int i = 0; i < 8; i++) {
3024 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3025 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3026 }
3027 }
3028
3029 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
3030 const uint16_t *left,
3031 int upsample_left, int dy,
3032 int bd) {
3033 __m256i dstvec[4], d[4], d1;
3034 if (bd < 12) {
3035 highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
3036 dy);
3037 } else {
3038 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
3039 upsample_left, dy);
3040 }
3041 highbd_transpose4x16_avx2(dstvec, d);
3042 for (int i = 0; i < 4; i++) {
3043 _mm_storel_epi64((__m128i *)(dst + i * stride),
3044 _mm256_castsi256_si128(d[i]));
3045 d1 = _mm256_bsrli_epi128(d[i], 8);
3046 _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
3047 _mm256_castsi256_si128(d1));
3048 _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
3049 _mm256_extracti128_si256(d[i], 1));
3050 _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
3051 _mm256_extracti128_si256(d1, 1));
3052 }
3053 }
3054
3055 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
3056 const uint16_t *left,
3057 int upsample_left, int dy,
3058 int bd) {
3059 __m128i dstvec[16], d[8];
3060 if (bd < 12) {
3061 highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
3062 dy);
3063 } else {
3064 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
3065 upsample_left, dy);
3066 }
3067 highbd_transpose16x4_8x8_sse2(dstvec, d);
3068
3069 _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
3070 _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
3071 _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
3072 _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
3073 _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
3074 _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
3075 _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
3076 _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
3077 }
3078
3079 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
3080 const uint16_t *left,
3081 int upsample_left, int dy,
3082 int bd) {
3083 __m256i dstvec[16], d[16];
3084 if (bd < 12) {
3085 highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
3086 dy);
3087 } else {
3088 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
3089 upsample_left, dy);
3090 }
3091
3092 for (int i = 0; i < 16; i += 8) {
3093 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3094 }
3095
3096 for (int i = 0; i < 8; i++) {
3097 _mm_storeu_si128((__m128i *)(dst + i * stride),
3098 _mm256_castsi256_si128(d[i]));
3099 }
3100 for (int i = 0; i < 8; i++) {
3101 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3102 _mm256_extracti128_si256(d[i], 1));
3103 }
3104 for (int i = 8; i < 16; i++) {
3105 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
3106 _mm256_castsi256_si128(d[i]));
3107 }
3108 for (int i = 8; i < 16; i++) {
3109 _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
3110 _mm256_extracti128_si256(d[i], 1));
3111 }
3112 }
3113
3114 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
3115 const uint16_t *left,
3116 int upsample_left, int dy,
3117 int bd) {
3118 __m128i dstvec[32], d[32];
3119 if (bd < 12) {
3120 highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
3121 dy);
3122 } else {
3123 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
3124 upsample_left, dy);
3125 }
3126
3127 for (int i = 0; i < 32; i += 8) {
3128 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
3129 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
3130 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
3131 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
3132 &d[5 + i], &d[6 + i], &d[7 + i]);
3133 }
3134 for (int i = 0; i < 8; i++) {
3135 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
3136 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
3137 _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
3138 _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
3139 }
3140 }
3141
3142 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
3143 const uint16_t *left,
3144 int upsample_left, int dy,
3145 int bd) {
3146 __m256i dstvec[16], d[16];
3147 if (bd < 12) {
3148 highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
3149 dy);
3150 } else {
3151 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
3152 upsample_left, dy);
3153 }
3154
3155 highbd_transpose16x16_avx2(dstvec, d);
3156
3157 for (int i = 0; i < 16; i++) {
3158 _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
3159 }
3160 }
3161
3162 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
3163 const uint16_t *left,
3164 int upsample_left, int dy,
3165 int bd) {
3166 __m256i dstvec[64], d[16];
3167 if (bd < 12) {
3168 highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
3169 dy);
3170 } else {
3171 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
3172 upsample_left, dy);
3173 }
3174 highbd_transpose16x16_avx2(dstvec, d);
3175 for (int j = 0; j < 16; j++) {
3176 _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
3177 }
3178 highbd_transpose16x16_avx2(dstvec + 16, d);
3179 for (int j = 0; j < 16; j++) {
3180 _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
3181 }
3182 highbd_transpose16x16_avx2(dstvec + 32, d);
3183 for (int j = 0; j < 16; j++) {
3184 _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
3185 }
3186 highbd_transpose16x16_avx2(dstvec + 48, d);
3187 for (int j = 0; j < 16; j++) {
3188 _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
3189 }
3190 }
3191
3192 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
3193 const uint16_t *left,
3194 int upsample_left, int dy,
3195 int bd) {
3196 DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
3197 if (bd < 12) {
3198 highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
3199 } else {
3200 highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
3201 dy);
3202 }
3203 highbd_transpose(dstT, 64, dst, stride, 64, 64);
3204 }
3205
3206 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
3207 const uint16_t *left,
3208 int upsample_left, int dy,
3209 int bd) {
3210 __m256i dstvec[32], d[32];
3211 if (bd < 12) {
3212 highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
3213 dy);
3214 } else {
3215 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
3216 upsample_left, dy);
3217 }
3218 for (int i = 0; i < 32; i += 8) {
3219 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
3220 }
3221 // store
3222 for (int j = 0; j < 32; j += 16) {
3223 for (int i = 0; i < 8; i++) {
3224 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
3225 _mm256_castsi256_si128(d[(i + j)]));
3226 }
3227 for (int i = 0; i < 8; i++) {
3228 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
3229 _mm256_castsi256_si128(d[(i + j) + 8]));
3230 }
3231 for (int i = 8; i < 16; i++) {
3232 _mm256_storeu_si256(
3233 (__m256i *)(dst + (i + j) * stride),
3234 _mm256_inserti128_si256(
3235 d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
3236 }
3237 }
3238 }
3239
3240 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
3241 const uint16_t *left,
3242 int upsample_left, int dy,
3243 int bd) {
3244 __m256i dstvec[32], d[16];
3245 if (bd < 12) {
3246 highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
3247 dy);
3248 } else {
3249 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
3250 upsample_left, dy);
3251 }
3252 for (int i = 0; i < 32; i += 16) {
3253 highbd_transpose16x16_avx2((dstvec + i), d);
3254 for (int j = 0; j < 16; j++) {
3255 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3256 }
3257 }
3258 }
3259
3260 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
3261 const uint16_t *left,
3262 int upsample_left, int dy,
3263 int bd) {
3264 uint16_t dstT[64 * 32];
3265 if (bd < 12) {
3266 highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
3267 } else {
3268 highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
3269 dy);
3270 }
3271 highbd_transpose(dstT, 64, dst, stride, 32, 64);
3272 }
3273
3274 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
3275 const uint16_t *left,
3276 int upsample_left, int dy,
3277 int bd) {
3278 DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
3279 highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
3280 highbd_transpose(dstT, 32, dst, stride, 64, 32);
3281 return;
3282 }
3283
3284 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
3285 const uint16_t *left,
3286 int upsample_left, int dy,
3287 int bd) {
3288 DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
3289 if (bd < 12) {
3290 highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
3291 } else {
3292 highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
3293 dy);
3294 }
3295 highbd_transpose(dstT, 64, dst, stride, 16, 64);
3296 }
3297
3298 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
3299 const uint16_t *left,
3300 int upsample_left, int dy,
3301 int bd) {
3302 __m256i dstvec[64], d[16];
3303 if (bd < 12) {
3304 highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
3305 dy);
3306 } else {
3307 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
3308 upsample_left, dy);
3309 }
3310 for (int i = 0; i < 64; i += 16) {
3311 highbd_transpose16x16_avx2((dstvec + i), d);
3312 for (int j = 0; j < 16; j++) {
3313 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
3314 }
3315 }
3316 }
3317
3318 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
3319 int bh, const uint16_t *above,
3320 const uint16_t *left, int upsample_left,
3321 int dx, int dy, int bd) {
3322 (void)above;
3323 (void)dx;
3324
3325 assert(dx == 1);
3326 assert(dy > 0);
3327 if (bw == bh) {
3328 switch (bw) {
3329 case 4:
3330 highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
3331 bd);
3332 break;
3333 case 8:
3334 highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
3335 bd);
3336 break;
3337 case 16:
3338 highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
3339 bd);
3340 break;
3341 case 32:
3342 highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
3343 bd);
3344 break;
3345 case 64:
3346 highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
3347 bd);
3348 break;
3349 }
3350 } else {
3351 if (bw < bh) {
3352 if (bw + bw == bh) {
3353 switch (bw) {
3354 case 4:
3355 highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
3356 dy, bd);
3357 break;
3358 case 8:
3359 highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
3360 dy, bd);
3361 break;
3362 case 16:
3363 highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
3364 dy, bd);
3365 break;
3366 case 32:
3367 highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
3368 dy, bd);
3369 break;
3370 }
3371 } else {
3372 switch (bw) {
3373 case 4:
3374 highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
3375 dy, bd);
3376 break;
3377 case 8:
3378 highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
3379 dy, bd);
3380 break;
3381 case 16:
3382 highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
3383 dy, bd);
3384 break;
3385 }
3386 }
3387 } else {
3388 if (bh + bh == bw) {
3389 switch (bh) {
3390 case 4:
3391 highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
3392 dy, bd);
3393 break;
3394 case 8:
3395 highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
3396 dy, bd);
3397 break;
3398 case 16:
3399 highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
3400 dy, bd);
3401 break;
3402 case 32:
3403 highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
3404 dy, bd);
3405 break;
3406 }
3407 } else {
3408 switch (bh) {
3409 case 4:
3410 highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
3411 dy, bd);
3412 break;
3413 case 8:
3414 highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
3415 dy, bd);
3416 break;
3417 case 16:
3418 highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
3419 dy, bd);
3420 break;
3421 }
3422 }
3423 }
3424 }
3425 return;
3426 }
3427
3428 // Low bit depth functions
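// BaseMask[k] has 0xff in its first k bytes and 0 elsewhere. With
// _mm_blendv_epi8 it keeps the first k bytes of one source and takes the
// rest from another, e.g. interpolated pixels vs. the above[max_base_x]
// padding in zone 1, or left-path vs. above-path pixels in zone 2.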
3429 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
3430 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3431 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3432 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3433 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3434 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3435 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3436 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3437 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3438 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3439 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3440 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3441 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3442 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3443 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3444 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3445 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3446 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3447 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3448 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3449 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3450 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3451 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3452 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3453 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3454 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3455 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3456 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3457 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3458 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3459 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3460 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3462 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3463 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
3464 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3465 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3466 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
3467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3468 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3469 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
3470 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3471 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3472 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
3473 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3474 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3475 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
3476 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3477 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3478 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
3479 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3480 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3481 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
3482 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3483 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3484 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
3485 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3486 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3487 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3488 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3489 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3490 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3491 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
3492 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3493 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3494 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
3495 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3496 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3497 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
3498 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3499 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3500 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
3501 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3502 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3503 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
3504 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3505 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3506 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
3507 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3508 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3509 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
3510 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3511 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3512 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
3513 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3514 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3515 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
3516 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3517 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
3518 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
3519 };
3520
3521 /* clang-format on */
3522 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
3523 int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
3524 int dx) {
3525 const int frac_bits = 6 - upsample_above;
3526 const int max_base_x = ((W + H) - 1) << upsample_above;
3527
3528 assert(dx > 0);
3529 // pre-filter above pixels
3530 // store in temp buffers:
3531 // above[x] * 32 + 16
3532 // above[x+1] - above[x]
3533 // final pixels will be calculated as:
3534 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
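  // For example, above[x] = 100, above[x + 1] = 132, shift = 8 (a quarter of
  // the way to the next sample) gives
  // (100 * 32 + 16 + (132 - 100) * 8) >> 5 = 3472 >> 5 = 108.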
3535 __m256i a0, a1, a32, a16;
3536 __m256i diff, c3f;
3537 __m128i a_mbase_x;
3538
3539 a16 = _mm256_set1_epi16(16);
3540 a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
3541 c3f = _mm256_set1_epi16(0x3f);
3542
3543 int x = dx;
3544 for (int r = 0; r < W; r++) {
3545 __m256i b, res, shift;
3546 __m128i res1, a0_128, a1_128;
3547
3548 int base = x >> frac_bits;
3549 int base_max_diff = (max_base_x - base) >> upsample_above;
3550 if (base_max_diff <= 0) {
3551 for (int i = r; i < W; ++i) {
3552 dst[i] = a_mbase_x;  // fill the remaining rows with the last above sample
3553 }
3554 return;
3555 }
3556 if (base_max_diff > H) base_max_diff = H;
3557 a0_128 = _mm_loadu_si128((__m128i *)(above + base));
3558 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
3559
3560 if (upsample_above) {
3561 a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
3562 a1_128 = _mm_srli_si128(a0_128, 8);
3563
3564 shift = _mm256_srli_epi16(
3565 _mm256_and_si256(
3566 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
3567 1);
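      // With upsampling the above row holds samples at half-pel spacing; the
      // even/odd shuffle above gathers the even-indexed samples into a0_128
      // and the odd-indexed ones into a1_128, and the position is scaled by
      // (1 << upsample_above) before its fractional bits are taken.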
3568 } else {
3569 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3570 }
3571 a0 = _mm256_cvtepu8_epi16(a0_128);
3572 a1 = _mm256_cvtepu8_epi16(a1_128);
3573
3574 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
3575 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
3576 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3577
3578 b = _mm256_mullo_epi16(diff, shift);
3579 res = _mm256_add_epi16(a32, b);
3580 res = _mm256_srli_epi16(res, 5);
3581
3582 res = _mm256_packus_epi16(
3583 res, _mm256_castsi128_si256(
3584 _mm256_extracti128_si256(res, 1))); // goto 8 bit
3585 res1 = _mm256_castsi256_si128(res); // 16 8bit values
3586
3587 dst[r] =
3588 _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
3589 x += dx;
3590 }
3591 }
3592
3593 static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3594 const uint8_t *above, int upsample_above,
3595 int dx) {
3596 __m128i dstvec[16];
3597
3598 dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
3599 for (int i = 0; i < N; i++) {
3600 *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
3601 }
3602 }
3603
3604 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3605 const uint8_t *above, int upsample_above,
3606 int dx) {
3607 __m128i dstvec[32];
3608
3609 dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
3610 for (int i = 0; i < N; i++) {
3611 _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
3612 }
3613 }
3614
3615 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3616 const uint8_t *above, int upsample_above,
3617 int dx) {
3618 __m128i dstvec[64];
3619
3620 dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
3621 for (int i = 0; i < N; i++) {
3622 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
3623 }
3624 }
3625
3626 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
3627 int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
3628 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3629 (void)upsample_above;
3630 const int frac_bits = 6;
3631 const int max_base_x = ((32 + N) - 1);
3632
3633 // pre-filter above pixels
3634 // store in temp buffers:
3635 // above[x] * 32 + 16
3636 // above[x+1] - above[x]
3637 // final pixels will be calculated as:
3638 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3639 __m256i a0, a1, a32, a16;
3640 __m256i a_mbase_x, diff, c3f;
3641
3642 a16 = _mm256_set1_epi16(16);
3643 a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3644 c3f = _mm256_set1_epi16(0x3f);
3645
3646 int x = dx;
3647 for (int r = 0; r < N; r++) {
3648 __m256i b, res, res16[2];
3649 __m128i a0_128, a1_128;
3650
3651 int base = x >> frac_bits;
3652 int base_max_diff = (max_base_x - base);
3653 if (base_max_diff <= 0) {
3654 for (int i = r; i < N; ++i) {
3655 dstvec[i] = a_mbase_x; // save 32 values
3656 }
3657 return;
3658 }
3659 if (base_max_diff > 32) base_max_diff = 32;
3660 __m256i shift =
3661 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3662
3663 for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
3664 int mdiff = base_max_diff - j;
3665 if (mdiff <= 0) {
3666 res16[jj] = a_mbase_x;
3667 } else {
3668 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3669 a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
3670 a0 = _mm256_cvtepu8_epi16(a0_128);
3671 a1 = _mm256_cvtepu8_epi16(a1_128);
3672
3673 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
3674 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
3675 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3676 b = _mm256_mullo_epi16(diff, shift);
3677
3678 res = _mm256_add_epi16(a32, b);
3679 res = _mm256_srli_epi16(res, 5);
3680 res16[jj] = _mm256_packus_epi16(
3681 res, _mm256_castsi128_si256(
3682 _mm256_extracti128_si256(res, 1))); // 16 8bit values
3683 }
3684 }
3685 res16[1] =
3686 _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
3687 1); // 32 8bit values
3688
3689 dstvec[r] = _mm256_blendv_epi8(
3690 a_mbase_x, res16[1],
3691 *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
3692 x += dx;
3693 }
3694 }
3695
3696 static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3697 const uint8_t *above, int upsample_above,
3698 int dx) {
3699 __m256i dstvec[64];
3700 dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
3701 for (int i = 0; i < N; i++) {
3702 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
3703 }
3704 }
3705
3706 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3707 const uint8_t *above, int upsample_above,
3708 int dx) {
3709 // here upsample_above is 0 by design of av1_use_intra_edge_upsample
3710 (void)upsample_above;
3711 const int frac_bits = 6;
3712 const int max_base_x = ((64 + N) - 1);
3713
3714 // pre-filter above pixels
3715 // store in temp buffers:
3716 // above[x] * 32 + 16
3717 // above[x+1] - above[x]
3718 // final pixels will be calculated as:
3719 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3720 __m256i a0, a1, a32, a16;
3721 __m256i a_mbase_x, diff, c3f;
3722 __m128i max_base_x128, base_inc128, mask128;
3723
3724 a16 = _mm256_set1_epi16(16);
3725 a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
3726 max_base_x128 = _mm_set1_epi8(max_base_x);
3727 c3f = _mm256_set1_epi16(0x3f);
3728
3729 int x = dx;
3730 for (int r = 0; r < N; r++, dst += stride) {
3731 __m256i b, res;
3732 int base = x >> frac_bits;
3733 if (base >= max_base_x) {
3734 for (int i = r; i < N; ++i) {
3735 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
3736 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
3737 dst += stride;
3738 }
3739 return;
3740 }
3741
3742 __m256i shift =
3743 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
3744
3745 __m128i a0_128, a1_128, res128;
3746 for (int j = 0; j < 64; j += 16) {
3747 int mdif = max_base_x - (base + j);
3748 if (mdif <= 0) {
3749 _mm_storeu_si128((__m128i *)(dst + j),
3750 _mm256_castsi256_si128(a_mbase_x));
3751 } else {
3752 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
3753 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
3754 a0 = _mm256_cvtepu8_epi16(a0_128);
3755 a1 = _mm256_cvtepu8_epi16(a1_128);
3756
3757 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
3758 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
3759 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
3760 b = _mm256_mullo_epi16(diff, shift);
3761
3762 res = _mm256_add_epi16(a32, b);
3763 res = _mm256_srli_epi16(res, 5);
3764 res = _mm256_packus_epi16(
3765 res, _mm256_castsi128_si256(
3766 _mm256_extracti128_si256(res, 1))); // 16 8bit values
3767
3768 base_inc128 =
3769 _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
3770 (int8_t)(base + j + 2), (int8_t)(base + j + 3),
3771 (int8_t)(base + j + 4), (int8_t)(base + j + 5),
3772 (int8_t)(base + j + 6), (int8_t)(base + j + 7),
3773 (int8_t)(base + j + 8), (int8_t)(base + j + 9),
3774 (int8_t)(base + j + 10), (int8_t)(base + j + 11),
3775 (int8_t)(base + j + 12), (int8_t)(base + j + 13),
3776 (int8_t)(base + j + 14), (int8_t)(base + j + 15));
3777
3778 mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
3779 _mm_setzero_si128());
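        // base_inc128 holds the source index of each of these 16 pixels;
        // lanes whose index has reached max_base_x fail the compare and are
        // replaced by the above[max_base_x] padding in the blend below (e.g.
        // base + j = 60 with max_base_x = 63 keeps only the first three
        // lanes).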
3780 res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
3781 _mm256_castsi256_si128(res), mask128);
3782 _mm_storeu_si128((__m128i *)(dst + j), res128);
3783 }
3784 }
3785 x += dx;
3786 }
3787 }
3788
3789 // Directional prediction, zone 1: 0 < angle < 90
3790 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
3791 const uint8_t *above, const uint8_t *left,
3792 int upsample_above, int dx, int dy) {
3793 (void)left;
3794 (void)dy;
3795 switch (bw) {
3796 case 4:
3797 dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
3798 break;
3799 case 8:
3800 dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
3801 break;
3802 case 16:
3803 dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
3804 break;
3805 case 32:
3806 dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
3807 break;
3808 case 64:
3809 dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
3810 break;
3811 default: break;
3812 }
3813 return;
3814 }
3815
3816 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3817 const uint8_t *above, const uint8_t *left,
3818 int upsample_above, int upsample_left,
3819 int dx, int dy) {
3820 const int min_base_x = -(1 << upsample_above);
3821 const int min_base_y = -(1 << upsample_left);
3822 const int frac_bits_x = 6 - upsample_above;
3823 const int frac_bits_y = 6 - upsample_left;
3824
3825 assert(dx > 0);
3826 // pre-filter above pixels
3827 // store in temp buffers:
3828 // above[x] * 32 + 16
3829 // above[x+1] - above[x]
3830 // final pixels will be calculated as:
3831 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3832 __m128i a0_x, a1_x, a32, a16, diff;
3833 __m128i c3f, min_base_y128, c1234, dy128;
3834
3835 a16 = _mm_set1_epi16(16);
3836 c3f = _mm_set1_epi16(0x3f);
3837 min_base_y128 = _mm_set1_epi16(min_base_y);
3838 c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
3839 dy128 = _mm_set1_epi16(dy);
3840
3841 for (int r = 0; r < N; r++) {
3842 __m128i b, res, shift, r6, ydx;
3843 __m128i resx, resy, resxy;
3844 __m128i a0_x128, a1_x128;
3845 int y = r + 1;
3846 int base_x = (-y * dx) >> frac_bits_x;
3847 int base_shift = 0;
3848 if (base_x < (min_base_x - 1)) {
3849 base_shift = (min_base_x - base_x - 1) >> upsample_above;
3850 }
3851 int base_min_diff =
3852 (min_base_x - base_x + upsample_above) >> upsample_above;
3853 if (base_min_diff > 4) {
3854 base_min_diff = 4;
3855 } else {
3856 if (base_min_diff < 0) base_min_diff = 0;
3857 }
3858
3859 if (base_shift > 3) {
3860 a0_x = _mm_setzero_si128();
3861 a1_x = _mm_setzero_si128();
3862 shift = _mm_setzero_si128();
3863 } else {
3864 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3865 ydx = _mm_set1_epi16(y * dx);
3866 r6 = _mm_slli_epi16(c1234, 6);
3867
3868 if (upsample_above) {
3869 a0_x128 =
3870 _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3871 a1_x128 = _mm_srli_si128(a0_x128, 8);
3872
3873 shift = _mm_srli_epi16(
3874 _mm_and_si128(
3875 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3876 1);
3877 } else {
3878 a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3879 a1_x128 = _mm_srli_si128(a0_x128, 1);
3880
3881 shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
3882 }
3883 a0_x = _mm_cvtepu8_epi16(a0_x128);
3884 a1_x = _mm_cvtepu8_epi16(a1_x128);
3885 }
3886 // y calc
3887 __m128i a0_y, a1_y, shifty;
3888 if (base_x < min_base_x) {
3889 DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
3890 __m128i y_c128, base_y_c128, mask128, c1234_;
3891 c1234_ = _mm_srli_si128(c1234, 2);
3892 r6 = _mm_set1_epi16(r << 6);
3893 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
3894 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
3895 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
3896 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
3897 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3898
3899 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3900 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3901 base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
3902 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
3903 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
3904 left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
3905
3906 if (upsample_left) {
3907 shifty = _mm_srli_epi16(
3908 _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
3909 } else {
3910 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
3911 }
3912 a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
3913 a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
3914 shift = _mm_unpacklo_epi64(shift, shifty);
3915 }
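    // When any pixel of this row comes from the left edge, the low 64 bits of
    // a0_x/a1_x now hold the above-path samples and the high 64 bits the
    // left-path samples, so the single 8-lane interpolation below serves
    // both; BaseMask[base_min_diff] then picks, per pixel, which result to
    // keep.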
3916
3917 diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
3918 a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
3919 a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
3920
3921 b = _mm_mullo_epi16(diff, shift);
3922 res = _mm_add_epi16(a32, b);
3923 res = _mm_srli_epi16(res, 5);
3924
3925 resx = _mm_packus_epi16(res, res);
3926 resy = _mm_srli_si128(resx, 4);
3927
3928 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
3929 *(int *)(dst) = _mm_cvtsi128_si32(resxy);
3930 dst += stride;
3931 }
3932 }
3933
3934 static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
3935 const uint8_t *above, const uint8_t *left,
3936 int upsample_above, int upsample_left,
3937 int dx, int dy) {
3938 const int min_base_x = -(1 << upsample_above);
3939 const int min_base_y = -(1 << upsample_left);
3940 const int frac_bits_x = 6 - upsample_above;
3941 const int frac_bits_y = 6 - upsample_left;
3942
3943 // pre-filter above pixels
3944 // store in temp buffers:
3945 // above[x] * 32 + 16
3946 // above[x+1] - above[x]
3947 // final pixels will be calculated as:
3948 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
3949 __m256i diff, a32, a16;
3950 __m256i a0_x, a1_x;
3951 __m128i a0_x128, a1_x128, min_base_y128, c3f;
3952 __m128i c1234, dy128;
3953
3954 a16 = _mm256_set1_epi16(16);
3955 c3f = _mm_set1_epi16(0x3f);
3956 min_base_y128 = _mm_set1_epi16(min_base_y);
3957 dy128 = _mm_set1_epi16(dy);
3958 c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
3959
3960 for (int r = 0; r < N; r++) {
3961 __m256i b, res, shift;
3962 __m128i resx, resy, resxy, r6, ydx;
3963
3964 int y = r + 1;
3965 int base_x = (-y * dx) >> frac_bits_x;
3966 int base_shift = 0;
3967 if (base_x < (min_base_x - 1)) {
3968 base_shift = (min_base_x - base_x - 1) >> upsample_above;
3969 }
3970 int base_min_diff =
3971 (min_base_x - base_x + upsample_above) >> upsample_above;
3972 if (base_min_diff > 8) {
3973 base_min_diff = 8;
3974 } else {
3975 if (base_min_diff < 0) base_min_diff = 0;
3976 }
3977
3978 if (base_shift > 7) {
3979 a0_x = _mm256_setzero_si256();
3980 a1_x = _mm256_setzero_si256();
3981 shift = _mm256_setzero_si256();
3982 } else {
3983 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
3984 ydx = _mm_set1_epi16(y * dx);
3985 r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
3986 if (upsample_above) {
3987 a0_x128 =
3988 _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
3989 a1_x128 = _mm_srli_si128(a0_x128, 8);
3990
3991 shift = _mm256_castsi128_si256(_mm_srli_epi16(
3992 _mm_and_si128(
3993 _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
3994 1));
3995 } else {
3996 a1_x128 = _mm_srli_si128(a0_x128, 1);
3997 a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
3998 a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
3999
4000 shift = _mm256_castsi128_si256(
4001 _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
4002 }
4003 a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
4004 a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
4005 }
4006
4007 // y calc
4008 __m128i a0_y, a1_y, shifty;
4009 if (base_x < min_base_x) {
4010 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4011 __m128i y_c128, base_y_c128, mask128;
4012 r6 = _mm_set1_epi16(r << 6);
4013 y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
4014 base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
4015 mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
4016 base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
4017 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4018
4019 a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4020 left[base_y_c[2]], left[base_y_c[3]],
4021 left[base_y_c[4]], left[base_y_c[5]],
4022 left[base_y_c[6]], left[base_y_c[7]]);
4023 base_y_c128 = _mm_add_epi16(
4024 base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
4025 _mm_store_si128((__m128i *)base_y_c, base_y_c128);
4026
4027 a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
4028 left[base_y_c[2]], left[base_y_c[3]],
4029 left[base_y_c[4]], left[base_y_c[5]],
4030 left[base_y_c[6]], left[base_y_c[7]]);
4031
4032 if (upsample_left) {
4033 shifty = _mm_srli_epi16(
4034 _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
4035 } else {
4036 shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
4037 }
4038
4039 a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
4040 a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
4041 shift = _mm256_inserti128_si256(shift, shifty, 1);
4042 }
4043
4044 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
4045 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
4046 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
4047
4048 b = _mm256_mullo_epi16(diff, shift);
4049 res = _mm256_add_epi16(a32, b);
4050 res = _mm256_srli_epi16(res, 5);
4051
4052 resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
4053 _mm256_castsi256_si128(res));
4054 resy = _mm256_extracti128_si256(res, 1);
4055 resy = _mm_packus_epi16(resy, resy);
4056
4057 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4058 _mm_storel_epi64((__m128i *)(dst), resxy);
4059 dst += stride;
4060 }
4061 }
4062
4063 static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
4064 ptrdiff_t stride, const uint8_t *above,
4065 const uint8_t *left, int upsample_above,
4066 int upsample_left, int dx, int dy) {
4067 // here upsample_above and upsample_left are 0 by design of
4068 // av1_use_intra_edge_upsample
4069 const int min_base_x = -1;
4070 const int min_base_y = -1;
4071 (void)upsample_above;
4072 (void)upsample_left;
4073 const int frac_bits_x = 6;
4074 const int frac_bits_y = 6;
4075
4076 __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
4077 __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
4078 __m128i a0_x128, a1_x128;
4079
4080 DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
4081 a16 = _mm256_set1_epi16(16);
4082 c1 = _mm256_srli_epi16(a16, 4);
4083 min_base_y256 = _mm256_set1_epi16(min_base_y);
4084 c3f = _mm256_set1_epi16(0x3f);
4085 dy256 = _mm256_set1_epi16(dy);
4086 c0123 =
4087 _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
4088 c1234 = _mm256_add_epi16(c0123, c1);
4089
4090 for (int r = 0; r < H; r++) {
4091 __m256i b, res, shift, j256, r6, ydx;
4092 __m128i resx, resy;
4093 __m128i resxy;
4094 int y = r + 1;
4095 ydx = _mm256_set1_epi16((int16_t)(y * dx));
4096
4097 int base_x = (-y * dx) >> frac_bits_x;
4098 for (int j = 0; j < W; j += 16) {
4099 j256 = _mm256_set1_epi16(j);
4100 int base_shift = 0;
4101 if ((base_x + j) < (min_base_x - 1)) {
4102 base_shift = (min_base_x - (base_x + j) - 1);
4103 }
4104 int base_min_diff = (min_base_x - base_x - j);
4105 if (base_min_diff > 16) {
4106 base_min_diff = 16;
4107 } else {
4108 if (base_min_diff < 0) base_min_diff = 0;
4109 }
4110
4111 if (base_shift < 16) {
4112 a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
4113 a1_x128 =
4114 _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
4115 a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
4116 a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
4117
4118 a0_x = _mm256_cvtepu8_epi16(a0_x128);
4119 a1_x = _mm256_cvtepu8_epi16(a1_x128);
4120
4121 r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
4122 shift = _mm256_srli_epi16(
4123 _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
4124
4125 diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
4126 a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
4127 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
4128
4129 b = _mm256_mullo_epi16(diff, shift);
4130 res = _mm256_add_epi16(a32, b);
4131 res = _mm256_srli_epi16(res, 5); // 16 16-bit values
4132 resx = _mm256_castsi256_si128(_mm256_packus_epi16(
4133 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4134 } else {
4135 resx = _mm_setzero_si128();
4136 }
4137
4138 // y calc
4139 if (base_x < min_base_x) {
4140 __m256i c256, y_c256, base_y_c256, mask256, mul16;
4141 r6 = _mm256_set1_epi16(r << 6);
4142 c256 = _mm256_add_epi16(j256, c1234);
4143 mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
4144 _mm256_srli_epi16(min_base_y256, 1));
4145 y_c256 = _mm256_sub_epi16(r6, mul16);
4146
4147 base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
4148 mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
4149
4150 base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
4151 int16_t min_y = (int16_t)_mm_extract_epi16(
4152 _mm256_extracti128_si256(base_y_c256, 1), 7);
4153 int16_t max_y =
4154 (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
4155 int16_t offset_diff = max_y - min_y;
4156
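        // When the 16 left-edge indices span fewer than 16 entries, load that
        // window once with a masked load and gather the samples with a byte
        // shuffle; otherwise fall back to 16 scalar lookups below.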
4157 if (offset_diff < 16) {
4158 __m256i min_y256 = _mm256_set1_epi16(min_y);
4159
4160 __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
4161 __m128i base_y_offset128 =
4162 _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
4163 _mm256_extracti128_si256(base_y_offset, 1));
4164
4165 __m128i a0_y128 = _mm_maskload_epi32(
4166 (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
4167 __m128i a1_y128 =
4168 _mm_maskload_epi32((int *)(left + min_y + 1),
4169 *(__m128i *)LoadMaskz2[offset_diff / 4]);
4170 a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
4171 a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
4172 a0_y = _mm256_cvtepu8_epi16(a0_y128);
4173 a1_y = _mm256_cvtepu8_epi16(a1_y128);
4174 } else {
4175 base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
4176 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4177
4178 a0_y = _mm256_setr_epi16(
4179 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4180 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4181 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4182 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4183 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4184 left[base_y_c[15]]);
4185 base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
4186 _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
4187
4188 a1_y = _mm256_setr_epi16(
4189 left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
4190 left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
4191 left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
4192 left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
4193 left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
4194 left[base_y_c[15]]);
4195 }
4196 shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
4197
4198 diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
4199 a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
4200 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
4201
4202 b = _mm256_mullo_epi16(diff, shifty);
4203 res = _mm256_add_epi16(a32, b);
4204 res = _mm256_srli_epi16(res, 5); // 16 16-bit values
4205 resy = _mm256_castsi256_si128(_mm256_packus_epi16(
4206 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
4207 } else {
4208 resy = _mm_setzero_si128();
4209 }
4210 resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
4211 _mm_storeu_si128((__m128i *)(dst + j), resxy);
4212 } // for j
4213 dst += stride;
4214 }
4215 }
4216
4217 // Directional prediction, zone 2: 90 < angle < 180
4218 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
4219 const uint8_t *above, const uint8_t *left,
4220 int upsample_above, int upsample_left, int dx,
4221 int dy) {
4222 assert(dx > 0);
4223 assert(dy > 0);
4224 switch (bw) {
4225 case 4:
4226 dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
4227 upsample_left, dx, dy);
4228 break;
4229 case 8:
4230 dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
4231 upsample_left, dx, dy);
4232 break;
4233 default:
4234 dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
4235 upsample_above, upsample_left, dx, dy);
4236 break;
4237 }
4238 return;
4239 }
4240
4241 // z3 functions
4242 static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
4243 __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
4244 __m256i w10, w11, w12, w13, w14, w15;
4245
4246 w0 = _mm256_unpacklo_epi8(x[0], x[1]);
4247 w1 = _mm256_unpacklo_epi8(x[2], x[3]);
4248 w2 = _mm256_unpacklo_epi8(x[4], x[5]);
4249 w3 = _mm256_unpacklo_epi8(x[6], x[7]);
4250
4251 w8 = _mm256_unpacklo_epi8(x[8], x[9]);
4252 w9 = _mm256_unpacklo_epi8(x[10], x[11]);
4253 w10 = _mm256_unpacklo_epi8(x[12], x[13]);
4254 w11 = _mm256_unpacklo_epi8(x[14], x[15]);
4255
4256 w4 = _mm256_unpacklo_epi16(w0, w1);
4257 w5 = _mm256_unpacklo_epi16(w2, w3);
4258 w12 = _mm256_unpacklo_epi16(w8, w9);
4259 w13 = _mm256_unpacklo_epi16(w10, w11);
4260
4261 w6 = _mm256_unpacklo_epi32(w4, w5);
4262 w7 = _mm256_unpackhi_epi32(w4, w5);
4263 w14 = _mm256_unpacklo_epi32(w12, w13);
4264 w15 = _mm256_unpackhi_epi32(w12, w13);
4265
4266 // Store first 4-line result
4267 d[0] = _mm256_unpacklo_epi64(w6, w14);
4268 d[1] = _mm256_unpackhi_epi64(w6, w14);
4269 d[2] = _mm256_unpacklo_epi64(w7, w15);
4270 d[3] = _mm256_unpackhi_epi64(w7, w15);
4271
4272 w4 = _mm256_unpackhi_epi16(w0, w1);
4273 w5 = _mm256_unpackhi_epi16(w2, w3);
4274 w12 = _mm256_unpackhi_epi16(w8, w9);
4275 w13 = _mm256_unpackhi_epi16(w10, w11);
4276
4277 w6 = _mm256_unpacklo_epi32(w4, w5);
4278 w7 = _mm256_unpackhi_epi32(w4, w5);
4279 w14 = _mm256_unpacklo_epi32(w12, w13);
4280 w15 = _mm256_unpackhi_epi32(w12, w13);
4281
4282 // Store second 4-line result
4283 d[4] = _mm256_unpacklo_epi64(w6, w14);
4284 d[5] = _mm256_unpackhi_epi64(w6, w14);
4285 d[6] = _mm256_unpacklo_epi64(w7, w15);
4286 d[7] = _mm256_unpackhi_epi64(w7, w15);
4287
4288 // upper half
4289 w0 = _mm256_unpackhi_epi8(x[0], x[1]);
4290 w1 = _mm256_unpackhi_epi8(x[2], x[3]);
4291 w2 = _mm256_unpackhi_epi8(x[4], x[5]);
4292 w3 = _mm256_unpackhi_epi8(x[6], x[7]);
4293
4294 w8 = _mm256_unpackhi_epi8(x[8], x[9]);
4295 w9 = _mm256_unpackhi_epi8(x[10], x[11]);
4296 w10 = _mm256_unpackhi_epi8(x[12], x[13]);
4297 w11 = _mm256_unpackhi_epi8(x[14], x[15]);
4298
4299 w4 = _mm256_unpacklo_epi16(w0, w1);
4300 w5 = _mm256_unpacklo_epi16(w2, w3);
4301 w12 = _mm256_unpacklo_epi16(w8, w9);
4302 w13 = _mm256_unpacklo_epi16(w10, w11);
4303
4304 w6 = _mm256_unpacklo_epi32(w4, w5);
4305 w7 = _mm256_unpackhi_epi32(w4, w5);
4306 w14 = _mm256_unpacklo_epi32(w12, w13);
4307 w15 = _mm256_unpackhi_epi32(w12, w13);
4308
4309 // Store first 4-line result
4310 d[8] = _mm256_unpacklo_epi64(w6, w14);
4311 d[9] = _mm256_unpackhi_epi64(w6, w14);
4312 d[10] = _mm256_unpacklo_epi64(w7, w15);
4313 d[11] = _mm256_unpackhi_epi64(w7, w15);
4314
4315 w4 = _mm256_unpackhi_epi16(w0, w1);
4316 w5 = _mm256_unpackhi_epi16(w2, w3);
4317 w12 = _mm256_unpackhi_epi16(w8, w9);
4318 w13 = _mm256_unpackhi_epi16(w10, w11);
4319
4320 w6 = _mm256_unpacklo_epi32(w4, w5);
4321 w7 = _mm256_unpackhi_epi32(w4, w5);
4322 w14 = _mm256_unpacklo_epi32(w12, w13);
4323 w15 = _mm256_unpackhi_epi32(w12, w13);
4324
4325 // Store second 4-line result
4326 d[12] = _mm256_unpacklo_epi64(w6, w14);
4327 d[13] = _mm256_unpackhi_epi64(w6, w14);
4328 d[14] = _mm256_unpacklo_epi64(w7, w15);
4329 d[15] = _mm256_unpackhi_epi64(w7, w15);
4330 }
4331
4332 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
4333 const uint8_t *left, int upsample_left,
4334 int dy) {
4335 __m128i dstvec[4], d[4];
4336
4337 dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
4338 transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
4339 &d[0], &d[1], &d[2], &d[3]);
4340
4341 *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
4342 *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
4343 *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
4344 *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
4345 return;
4346 }
4347
4348 static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
4349 const uint8_t *left, int upsample_left,
4350 int dy) {
4351 __m128i dstvec[8], d[8];
4352
4353 dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
4354 transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
4355 &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
4356 &d[3]);
4357
4358 _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
4359 _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
4360 _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
4361 _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
4362 _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
4363 _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
4364 _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
4365 _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
4366 }
4367
static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[8], d[4];

  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                        &d[1], &d[2], &d[3]);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
}

static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm_srli_si128(d[i], 8));
  }
}

static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[8];

  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
  for (int i = 4; i < 8; i++) {
    d[i] = _mm_setzero_si128();
  }
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

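// For 8x32, the 32-wide z1 kernel fills only the first 8 of the 16 transpose
// input rows; the remaining inputs are zeroed, and only the low 8 bytes of
// each transposed row are stored.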
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  for (int i = 8; i < 16; i++) {
    dstvec[i] = _mm256_setzero_si256();
  }
  transpose16x32_avx2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride),
                     _mm256_castsi256_si128(d[i]));
  }
  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
                     _mm256_extracti128_si256(d[i], 1));
  }
}

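// For 32x8, the 32 z1 row vectors are transposed in two 16-row halves and
// each 32-byte output row is written with two 16-byte stores.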
static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);

  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);
  transpose16x8_8x16_sse2(
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
      &d[6 + 8], &d[7 + 8]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
  }
}

static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
  transpose16x16_sse2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i dstvec[32], d[32];

  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  transpose16x32_avx2(dstvec + 16, d + 16);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
                     _mm256_castsi256_si128(d[j + 16]));
  }
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
                     _mm256_extracti128_si256(d[j + 16], 1));
  }
}

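// The 64-sample-wide and 64-sample-tall variants below predict into a stack
// buffer with the z1 kernels and then use the generic byte transpose to copy
// the result into dst.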
static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 64, 64);
}

static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  // store
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
  }
}

static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[64 * 32];
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 32, 64);
}

static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[32 * 64];
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
  transpose(dstT, 32, dst, stride, 64, 32);
}

static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[64 * 16];
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 16, 64);
}

static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[64], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

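// Dispatch on block shape: square sizes map directly to a WxW kernel, while
// rectangular sizes are split into the 2:1 (bw + bw == bh, or bh + bh == bw)
// and 4:1 aspect-ratio kernels.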
void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
        break;
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      }
    } else {
      if (bh + bh == bw) {
        switch (bh) {
          case 4:
            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bh) {
          case 4:
            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      }
    }
  }
}
