1 /*
2  * Copyright © 2018, VideoLAN and dav1d authors
3  * Copyright © 2018, Two Orioles, LLC
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "common/attributes.h"
34 #include "common/intops.h"
35 
36 #include "src/ipred.h"
37 #include "src/tables.h"
38 
39 static NOINLINE void
splat_dc(pixel * dst,const ptrdiff_t stride,const int width,const int height,const int dc HIGHBD_DECL_SUFFIX)40 splat_dc(pixel *dst, const ptrdiff_t stride,
41          const int width, const int height, const int dc HIGHBD_DECL_SUFFIX)
42 {
43 #if BITDEPTH == 8
44     assert(dc <= 0xff);
45     if (width > 4) {
46         const uint64_t dcN = dc * 0x0101010101010101ULL;
47         for (int y = 0; y < height; y++) {
48             for (int x = 0; x < width; x += sizeof(dcN))
49                 *((uint64_t *) &dst[x]) = dcN;
50             dst += PXSTRIDE(stride);
51         }
52     } else {
53         const unsigned dcN = dc * 0x01010101U;
54         for (int y = 0; y < height; y++) {
55             for (int x = 0; x < width; x += sizeof(dcN))
56                 *((unsigned *) &dst[x]) = dcN;
57             dst += PXSTRIDE(stride);
58         }
59     }
60 #else
61     assert(dc <= bitdepth_max);
62     const uint64_t dcN = dc * 0x0001000100010001ULL;
63     for (int y = 0; y < height; y++) {
64         for (int x = 0; x < width; x += sizeof(dcN) >> 1)
65             *((uint64_t *) &dst[x]) = dcN;
66         dst += PXSTRIDE(stride);
67     }
68 #endif
69 }
70 
71 static NOINLINE void
cfl_pred(pixel * dst,const ptrdiff_t stride,const int width,const int height,const int dc,const int16_t * ac,const int alpha HIGHBD_DECL_SUFFIX)72 cfl_pred(pixel *dst, const ptrdiff_t stride,
73          const int width, const int height, const int dc,
74          const int16_t *ac, const int alpha HIGHBD_DECL_SUFFIX)
75 {
76     for (int y = 0; y < height; y++) {
77         for (int x = 0; x < width; x++) {
78             const int diff = alpha * ac[x];
79             dst[x] = iclip_pixel(dc + apply_sign((abs(diff) + 32) >> 6, diff));
80         }
81         ac += width;
82         dst += PXSTRIDE(stride);
83     }
84 }
85 
/*
 * Rounded mean of the width pixels stored at topleft[1] .. topleft[width].
 * The division is a right shift by ctz(width), so the result is the exact
 * rounded mean only when width is a power of two.
 */
static unsigned dc_gen_top(const pixel *const topleft, const int width) {
    const pixel *const above = &topleft[1];
    unsigned sum = width >> 1; /* rounding bias */

    for (int n = 0; n < width; n++)
        sum += above[n];
    return sum >> ctz(width);
}
92 
/*
 * DC prediction from the top edge only: fill the block with the value
 * returned by dc_gen_top(). a, max_width and max_height are not used here.
 */
static void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
{
    const unsigned dc = dc_gen_top(topleft, width);

    splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
}
102 
/*
 * Chroma-from-luma prediction whose DC term is the top-edge value from
 * dc_gen_top().
 */
static void ipred_cfl_top_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
                            const int16_t *ac, const int alpha
                            HIGHBD_DECL_SUFFIX)
{
    const unsigned dc = dc_gen_top(topleft, width);

    cfl_pred(dst, stride, width, height, dc, ac, alpha HIGHBD_TAIL_SUFFIX);
}
112 
/*
 * Rounded mean of the height pixels stored at topleft[-1] .. topleft[-height].
 * The division is a right shift by ctz(height), so the result is the exact
 * rounded mean only when height is a power of two.
 */
static unsigned dc_gen_left(const pixel *const topleft, const int height) {
    unsigned sum = height >> 1; /* rounding bias */

    for (int n = 1; n <= height; n++)
        sum += topleft[-n];
    return sum >> ctz(height);
}
119 
/*
 * DC prediction from the left edge only: fill the block with the value
 * returned by dc_gen_left(). a, max_width and max_height are not used here.
 */
static void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height, const int a,
                            const int max_width, const int max_height
                            HIGHBD_DECL_SUFFIX)
{
    const unsigned dc = dc_gen_left(topleft, height);

    splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
}
129 
/*
 * Chroma-from-luma prediction whose DC term is the left-edge value from
 * dc_gen_left().
 */
static void ipred_cfl_left_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height,
                             const int16_t *ac, const int alpha
                             HIGHBD_DECL_SUFFIX)
{
    cfl_pred(dst, stride, width, height, dc_gen_left(topleft, height),
             ac, alpha HIGHBD_TAIL_SUFFIX);
}
139 
140 #if BITDEPTH == 8
141 #define MULTIPLIER_1x2 0x5556
142 #define MULTIPLIER_1x4 0x3334
143 #define BASE_SHIFT 16
144 #else
145 #define MULTIPLIER_1x2 0xAAAB
146 #define MULTIPLIER_1x4 0x6667
147 #define BASE_SHIFT 17
148 #endif
149 
dc_gen(const pixel * const topleft,const int width,const int height)150 static unsigned dc_gen(const pixel *const topleft,
151                        const int width, const int height)
152 {
153     unsigned dc = (width + height) >> 1;
154     for (int i = 0; i < width; i++)
155        dc += topleft[i + 1];
156     for (int i = 0; i < height; i++)
157        dc += topleft[-(i + 1)];
158     dc >>= ctz(width + height);
159 
160     if (width != height) {
161         dc *= (width > height * 2 || height > width * 2) ? MULTIPLIER_1x4 :
162                                                            MULTIPLIER_1x2;
163         dc >>= BASE_SHIFT;
164     }
165     return dc;
166 }
167 
/*
 * DC prediction from both edges: fill the block with the value returned by
 * dc_gen(). a, max_width and max_height are not used here.
 */
static void ipred_dc_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft,
                       const int width, const int height, const int a,
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
{
    const unsigned dc = dc_gen(topleft, width, height);

    splat_dc(dst, stride, width, height, dc HIGHBD_TAIL_SUFFIX);
}
177 
/*
 * Chroma-from-luma prediction whose DC term is the combined top/left value
 * from dc_gen().
 */
static void ipred_cfl_c(pixel *dst, const ptrdiff_t stride,
                        const pixel *const topleft,
                        const int width, const int height,
                        const int16_t *ac, const int alpha
                        HIGHBD_DECL_SUFFIX)
{
    cfl_pred(dst, stride, width, height, dc_gen(topleft, width, height),
             ac, alpha HIGHBD_TAIL_SUFFIX);
}
187 
188 #undef MULTIPLIER_1x2
189 #undef MULTIPLIER_1x4
190 #undef BASE_SHIFT
191 
/*
 * DC prediction without any edge input: fill the block with a fixed
 * mid-range value, (bitdepth_max + 1) >> 1 when BITDEPTH is 16 and 128
 * otherwise. topleft, a, max_width and max_height are not used here.
 */
static void ipred_dc_128_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
{
#if BITDEPTH == 16
    const int midpoint = (bitdepth_max + 1) >> 1;
#else
    const int midpoint = 128;
#endif

    splat_dc(dst, stride, width, height, midpoint HIGHBD_TAIL_SUFFIX);
}
205 
/*
 * Chroma-from-luma prediction around a fixed mid-range DC term:
 * (bitdepth_max + 1) >> 1 when BITDEPTH is 16 and 128 otherwise.
 * topleft is not used here.
 */
static void ipred_cfl_128_c(pixel *dst, const ptrdiff_t stride,
                            const pixel *const topleft,
                            const int width, const int height,
                            const int16_t *ac, const int alpha
                            HIGHBD_DECL_SUFFIX)
{
#if BITDEPTH == 16
    const int midpoint = (bitdepth_max + 1) >> 1;
#else
    const int midpoint = 128;
#endif

    cfl_pred(dst, stride, width, height, midpoint, ac, alpha
             HIGHBD_TAIL_SUFFIX);
}
219 
/*
 * Vertical prediction: every output row is a copy of the width pixels
 * starting at topleft[1]. a, max_width and max_height are not used here.
 */
static void ipred_v_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
                      const int width, const int height, const int a,
                      const int max_width, const int max_height
                      HIGHBD_DECL_SUFFIX)
{
    const pixel *const above = topleft + 1;

    for (int rows_left = height; rows_left > 0; rows_left--) {
        pixel_copy(dst, above, width);
        dst += PXSTRIDE(stride);
    }
}
231 
/*
 * Horizontal prediction: output row y is filled with the single pixel
 * topleft[-(1 + y)]. a, max_width and max_height are not used here.
 */
static void ipred_h_c(pixel *dst, const ptrdiff_t stride,
                      const pixel *const topleft,
                      const int width, const int height, const int a,
                      const int max_width, const int max_height
                      HIGHBD_DECL_SUFFIX)
{
    for (int row = 1; row <= height; row++) {
        const pixel left_px = topleft[-row];

        pixel_set(dst, left_px, width);
        dst += PXSTRIDE(stride);
    }
}
243 
/*
 * Paeth prediction. For each pixel, form base = left + top - topleft and
 * output whichever of left, top and topleft is closest to base. Ties favor
 * left first, then top.
 *
 * tl_ptr[0] is the top-left corner, tl_ptr[1 + x] the pixel above column x
 * and tl_ptr[-(1 + y)] the pixel left of row y. a, max_width and max_height
 * are not used here.
 */
static void ipred_paeth_c(pixel *dst, const ptrdiff_t stride,
                          const pixel *const tl_ptr,
                          const int width, const int height, const int a,
                          const int max_width, const int max_height
                          HIGHBD_DECL_SUFFIX)
{
    const int corner = tl_ptr[0];
    const pixel *const above = &tl_ptr[1];

    for (int row = 0; row < height; row++, dst += PXSTRIDE(stride)) {
        const int left = tl_ptr[-(row + 1)];

        for (int col = 0; col < width; col++) {
            const int top = above[col];
            const int base = left + top - corner;
            const int dist_left = abs(left - base);
            const int dist_top = abs(top - base);
            const int dist_corner = abs(corner - base);
            int chosen;

            if (dist_left <= dist_top && dist_left <= dist_corner)
                chosen = left;
            else if (dist_top <= dist_corner)
                chosen = top;
            else
                chosen = corner;
            dst[col] = chosen;
        }
    }
}
266 
/*
 * Smooth prediction in both directions. Each pixel is the rounded average of
 * a vertical blend (pixel above vs. topleft[-height]) and a horizontal blend
 * (pixel to the left vs. topleft[width]). Weights are out of 256 and come
 * from dav1d_sm_weights, indexed from the block's height and width.
 * a, max_width and max_height are not used here.
 */
static void ipred_smooth_c(pixel *dst, const ptrdiff_t stride,
                           const pixel *const topleft,
                           const int width, const int height, const int a,
                           const int max_width, const int max_height
                           HIGHBD_DECL_SUFFIX)
{
    const uint8_t *const w_hor = &dav1d_sm_weights[width];
    const uint8_t *const w_ver = &dav1d_sm_weights[height];
    const int right = topleft[width];
    const int bottom = topleft[-height];

    for (int row = 0; row < height; row++, dst += PXSTRIDE(stride)) {
        const int wv = w_ver[row];
        const int left = topleft[-(1 + row)];

        for (int col = 0; col < width; col++) {
            const int wh = w_hor[col];
            const int vert = wv * topleft[1 + col] + (256 - wv) * bottom;
            const int horz = wh * left + (256 - wh) * right;

            /* Two blends of weight 256 each: divide by 512 with rounding. */
            dst[col] = (vert + horz + 256) >> 9;
        }
    }
}
288 
/*
 * Vertical smooth prediction: blend the pixel above each column with
 * topleft[-height], using a per-row weight (out of 256) taken from
 * dav1d_sm_weights starting at index height.
 * a, max_width and max_height are not used here.
 */
static void ipred_smooth_v_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height, const int a,
                             const int max_width, const int max_height
                             HIGHBD_DECL_SUFFIX)
{
    const uint8_t *const w_ver = &dav1d_sm_weights[height];
    const int bottom = topleft[-height];

    for (int row = 0; row < height; row++, dst += PXSTRIDE(stride)) {
        const int wv = w_ver[row];
        const int bottom_term = (256 - wv) * bottom;

        for (int col = 0; col < width; col++) {
            const int blend = wv * topleft[1 + col] + bottom_term;

            dst[col] = (blend + 128) >> 8;
        }
    }
}
307 
/*
 * Horizontal smooth prediction: blend the pixel left of each row with
 * topleft[width], using a per-column weight (out of 256) taken from
 * dav1d_sm_weights starting at index width.
 * a, max_width and max_height are not used here.
 */
static void ipred_smooth_h_c(pixel *dst, const ptrdiff_t stride,
                             const pixel *const topleft,
                             const int width, const int height, const int a,
                             const int max_width, const int max_height
                             HIGHBD_DECL_SUFFIX)
{
    const uint8_t *const w_hor = &dav1d_sm_weights[width];
    const int right = topleft[width];

    for (int row = 0; row < height; row++, dst += PXSTRIDE(stride)) {
        const int left = topleft[-(row + 1)];

        for (int col = 0; col < width; col++) {
            const int wh = w_hor[col];
            const int blend = wh * left + (256 - wh) * right;

            dst[col] = (blend + 128) >> 8;
        }
    }
}
326 
get_filter_strength(const int wh,const int angle,const int is_sm)327 static NOINLINE int get_filter_strength(const int wh, const int angle,
328                                         const int is_sm)
329 {
330     if (is_sm) {
331         if (wh <= 8) {
332             if (angle >= 64) return 2;
333             if (angle >= 40) return 1;
334         } else if (wh <= 16) {
335             if (angle >= 48) return 2;
336             if (angle >= 20) return 1;
337         } else if (wh <= 24) {
338             if (angle >=  4) return 3;
339         } else {
340             return 3;
341         }
342     } else {
343         if (wh <= 8) {
344             if (angle >= 56) return 1;
345         } else if (wh <= 16) {
346             if (angle >= 40) return 1;
347         } else if (wh <= 24) {
348             if (angle >= 32) return 3;
349             if (angle >= 16) return 2;
350             if (angle >=  8) return 1;
351         } else if (wh <= 32) {
352             if (angle >= 32) return 3;
353             if (angle >=  4) return 2;
354             return 1;
355         } else {
356             return 3;
357         }
358     }
359     return 0;
360 }
361 
filter_edge(pixel * const out,const int sz,const int lim_from,const int lim_to,const pixel * const in,const int from,const int to,const int strength)362 static NOINLINE void filter_edge(pixel *const out, const int sz,
363                                  const int lim_from, const int lim_to,
364                                  const pixel *const in, const int from,
365                                  const int to, const int strength)
366 {
367     static const uint8_t kernel[3][5] = {
368         { 0, 4, 8, 4, 0 },
369         { 0, 5, 6, 5, 0 },
370         { 2, 4, 4, 4, 2 }
371     };
372 
373     assert(strength > 0);
374     int i = 0;
375     for (; i < imin(sz, lim_from); i++)
376         out[i] = in[iclip(i, from, to - 1)];
377     for (; i < imin(lim_to, sz); i++) {
378         int s = 0;
379         for (int j = 0; j < 5; j++)
380             s += in[iclip(i - 2 + j, from, to - 1)] * kernel[strength - 1][j];
381         out[i] = (s + 8) >> 4;
382     }
383     for (; i < sz; i++)
384         out[i] = in[iclip(i, from, to - 1)];
385 }
386 
/*
 * Decide whether an edge is upsampled: returns 1 when angle is below 40 and
 * wh does not exceed 16 >> is_sm (16, or 8 when is_sm is 1); otherwise 0.
 */
static inline int get_upsample(const int wh, const int angle, const int is_sm) {
    if (angle >= 40)
        return 0;

    const int max_wh = 16 >> is_sm;
    return wh <= max_wh;
}
390 
upsample_edge(pixel * const out,const int hsz,const pixel * const in,const int from,const int to HIGHBD_DECL_SUFFIX)391 static NOINLINE void upsample_edge(pixel *const out, const int hsz,
392                                    const pixel *const in, const int from,
393                                    const int to HIGHBD_DECL_SUFFIX)
394 {
395     static const int8_t kernel[4] = { -1, 9, 9, -1 };
396     int i;
397     for (i = 0; i < hsz - 1; i++) {
398         out[i * 2] = in[iclip(i, from, to - 1)];
399 
400         int s = 0;
401         for (int j = 0; j < 4; j++)
402             s += in[iclip(i + j - 1, from, to - 1)] * kernel[j];
403         out[i * 2 + 1] = iclip_pixel((s + 8) >> 4);
404     }
405     out[i * 2] = in[iclip(i, from, to - 1)];
406 }
407 
/*
 * Directional prediction for angles below 90, reading only the top edge.
 *
 * angle packs three fields: bits 0-8 hold the angle, bit 9 is the is_sm flag
 * and the bits from 10 upwards enable intra edge filtering. When filtering
 * is enabled the top edge is either upsampled (see get_upsample()) or
 * smoothed (see get_filter_strength()) into a local buffer before use.
 *
 * Each row steps along the edge by a per-row position with 6 fractional
 * bits; a pixel is a linear blend of two neighboring edge pixels. Once the
 * position reaches the end of the usable edge, the remainder of the row is
 * filled with the last edge pixel. max_width and max_height are not used.
 */
static void ipred_z1_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int edge_filter_on = angle >> 10;
    angle &= 511;
    assert(angle < 90);

    const int edge_len = width + height;
    const int avail = width + imin(width, height);
    const int delta = 90 - angle;
    int step = dav1d_dr_intra_derivative[angle >> 1];
    pixel edge_buf[64 + 64];

    /* Defaults: use the caller's edge unmodified. */
    const pixel *top = &topleft_in[1];
    int max_base_x = avail - 1;
    int upsample_above = 0;

    if (edge_filter_on) {
        upsample_above = get_upsample(edge_len, delta, is_sm);
        if (upsample_above) {
            upsample_edge(edge_buf, edge_len, &topleft_in[1], -1, avail
                          HIGHBD_TAIL_SUFFIX);
            top = edge_buf;
            max_base_x = 2 * edge_len - 2;
            step <<= 1; /* edge resolution doubled */
        } else {
            const int strength = get_filter_strength(edge_len, delta, is_sm);

            if (strength) {
                filter_edge(edge_buf, edge_len, 0, edge_len,
                            &topleft_in[1], -1, avail, strength);
                top = edge_buf;
                max_base_x = edge_len - 1;
            }
        }
    }

    const int base_inc = 1 + upsample_above;
    int xpos = step;

    for (int row = 0; row < height; row++) {
        const int frac = xpos & 0x3E;
        int base = xpos >> 6;
        int col = 0;

        while (col < width && base < max_base_x) {
            const int v = top[base] * (64 - frac) + top[base + 1] * frac;

            dst[col] = (v + 32) >> 6;
            col++;
            base += base_inc;
        }
        /* Past the end of the edge: replicate the last edge pixel. */
        if (col < width)
            pixel_set(&dst[col], top[max_base_x], width - col);

        dst += PXSTRIDE(stride);
        xpos += step;
    }
}
461 
/*
 * Directional prediction for angles strictly between 90 and 180, reading
 * both the top and the left edge.
 *
 * angle packs three fields: bits 0-8 hold the angle, bit 9 is the is_sm flag
 * and the bits from 10 upwards enable intra edge filtering. Both edges are
 * first gathered into a local buffer around a common top-left pixel; each
 * edge is upsampled, smoothed or copied verbatim on the way.
 *
 * For each output pixel the position along the top edge is tried first;
 * when it falls left of the block (negative base) the pixel is taken from
 * the left edge instead. Either way the result is a linear blend of two
 * neighboring edge pixels with 6 fractional bits.
 */
static void ipred_z2_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int edge_filter_on = angle >> 10;
    angle &= 511;
    assert(angle > 90 && angle < 180);

    const int wh = width + height;
    const int above_delta = angle - 90;
    const int left_delta = 180 - angle;
    int step_y = dav1d_dr_intra_derivative[above_delta >> 1];
    int step_x = dav1d_dr_intra_derivative[left_delta >> 1];
    const int upsample_left = edge_filter_on ?
        get_upsample(wh, left_delta, is_sm) : 0;
    const int upsample_above = edge_filter_on ?
        get_upsample(wh, above_delta, is_sm) : 0;

    /* Left edge lives below index 64, top edge above it. */
    pixel edge[64 + 64 + 1];
    pixel *const topleft = &edge[64];

    /* Top edge. */
    if (upsample_above) {
        upsample_edge(topleft, width + 1, topleft_in, 0, width + 1
                      HIGHBD_TAIL_SUFFIX);
        step_x <<= 1;
    } else {
        const int strength = edge_filter_on ?
            get_filter_strength(wh, above_delta, is_sm) : 0;

        if (strength) {
            filter_edge(&topleft[1], width, 0, max_width,
                        &topleft_in[1], -1, width, strength);
        } else {
            pixel_copy(&topleft[1], &topleft_in[1], width);
        }
    }

    /* Left edge. */
    if (upsample_left) {
        upsample_edge(&topleft[-height * 2], height + 1,
                      &topleft_in[-height], 0, height + 1
                      HIGHBD_TAIL_SUFFIX);
        step_y <<= 1;
    } else {
        const int strength = edge_filter_on ?
            get_filter_strength(wh, left_delta, is_sm) : 0;

        if (strength) {
            filter_edge(&topleft[-height], height,
                        height - max_height, height,
                        &topleft_in[-height], 0, height + 1, strength);
        } else {
            pixel_copy(&topleft[-height], &topleft_in[-height], height);
        }
    }

    /* The corner itself is always taken unmodified from the input. */
    *topleft = *topleft_in;

    const int base_inc_x = 1 + upsample_above;
    const int y_shift = 6 + upsample_left;
    const int min_base_y = -(1 + upsample_left);
    const pixel *const left = &topleft[min_base_y];
    int xpos = (base_inc_x << 6) - step_x;

    for (int row = 0; row < height; row++) {
        const int frac_x = xpos & 0x3E;
        int base_x = xpos >> 6;
        int ypos = (row << y_shift) - step_y;

        for (int col = 0; col < width; col++) {
            int v;

            if (base_x < 0) {
                /* Projects onto the left edge. */
                const int base_y = ypos >> 6;
                assert(base_y >= min_base_y);
                const int frac_y = ypos & 0x3E;

                v = left[-base_y] * (64 - frac_y) +
                    left[-(base_y + 1)] * frac_y;
            } else {
                /* Projects onto the top edge. */
                v = topleft[base_x] * (64 - frac_x) +
                    topleft[base_x + 1] * frac_x;
            }
            dst[col] = (v + 32) >> 6;

            base_x += base_inc_x;
            ypos -= step_y;
        }

        xpos -= step_x;
        dst += PXSTRIDE(stride);
    }
}
541 
/*
 * Directional prediction for angles above 180, reading only the left edge.
 *
 * angle packs three fields: bits 0-8 hold the angle, bit 9 is the is_sm flag
 * and the bits from 10 upwards enable intra edge filtering. When filtering
 * is enabled the left edge is either upsampled or smoothed into a local
 * buffer before use. The edge is addressed with negative indices from left.
 *
 * The block is produced column by column. Each column steps along the edge
 * by a position with 6 fractional bits; a pixel is a linear blend of two
 * neighboring edge pixels. Once the position reaches the end of the usable
 * edge, the remainder of the column is filled with the last edge pixel.
 * max_width and max_height are not used.
 */
static void ipred_z3_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const topleft_in,
                       const int width, const int height, int angle,
                       const int max_width, const int max_height
                       HIGHBD_DECL_SUFFIX)
{
    const int is_sm = (angle >> 9) & 0x1;
    const int edge_filter_on = angle >> 10;
    angle &= 511;
    assert(angle > 180);

    const int wh = width + height;
    const int delta = angle - 180;
    int step = dav1d_dr_intra_derivative[(270 - angle) >> 1];
    pixel edge_buf[64 + 64];

    /* Defaults: use the caller's edge unmodified. */
    const pixel *left = &topleft_in[-1];
    int max_base_y = height + imin(width, height) - 1;
    int upsample_left = 0;

    if (edge_filter_on) {
        const int first = imax(width - height, 0);

        upsample_left = get_upsample(wh, delta, is_sm);
        if (upsample_left) {
            upsample_edge(edge_buf, wh, &topleft_in[-wh], first, wh + 1
                          HIGHBD_TAIL_SUFFIX);
            left = &edge_buf[2 * wh - 2];
            max_base_y = 2 * wh - 2;
            step <<= 1; /* edge resolution doubled */
        } else {
            const int strength = get_filter_strength(wh, delta, is_sm);

            if (strength) {
                filter_edge(edge_buf, wh, 0, wh, &topleft_in[-wh],
                            first, wh + 1, strength);
                left = &edge_buf[wh - 1];
                max_base_y = wh - 1;
            }
        }
    }

    const int base_inc = 1 + upsample_left;

    for (int col = 0, ypos = step; col < width; col++, ypos += step) {
        const int frac = ypos & 0x3E;
        int base = ypos >> 6;
        int row = 0;

        for (; row < height && base < max_base_y; row++, base += base_inc) {
            const int v = left[-base] * (64 - frac) +
                          left[-(base + 1)] * frac;

            dst[row * PXSTRIDE(stride) + col] = (v + 32) >> 6;
        }
        /* Past the end of the edge: replicate the last edge pixel. */
        for (; row < height; row++)
            dst[row * PXSTRIDE(stride) + col] = left[-max_base_y];
    }
}
600 
601 #if ARCH_X86
602 #define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
603     flt_ptr[ 0] * p0 + flt_ptr[ 1] * p1 +           \
604     flt_ptr[16] * p2 + flt_ptr[17] * p3 +           \
605     flt_ptr[32] * p4 + flt_ptr[33] * p5 +           \
606     flt_ptr[48] * p6
607 #define FLT_INCR 2
608 #else
609 #define FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6) \
610     flt_ptr[ 0] * p0 + flt_ptr[ 8] * p1 +           \
611     flt_ptr[16] * p2 + flt_ptr[24] * p3 +           \
612     flt_ptr[32] * p4 + flt_ptr[40] * p5 +           \
613     flt_ptr[48] * p6
614 #define FLT_INCR 1
615 #endif
616 
617 /* Up to 32x32 only */
ipred_filter_c(pixel * dst,const ptrdiff_t stride,const pixel * const topleft_in,const int width,const int height,int filt_idx,const int max_width,const int max_height HIGHBD_DECL_SUFFIX)618 static void ipred_filter_c(pixel *dst, const ptrdiff_t stride,
619                            const pixel *const topleft_in,
620                            const int width, const int height, int filt_idx,
621                            const int max_width, const int max_height
622                            HIGHBD_DECL_SUFFIX)
623 {
624     filt_idx &= 511;
625     assert(filt_idx < 5);
626 
627     const int8_t *const filter = dav1d_filter_intra_taps[filt_idx];
628     const pixel *top = &topleft_in[1];
629     for (int y = 0; y < height; y += 2) {
630         const pixel *topleft = &topleft_in[-y];
631         const pixel *left = &topleft[-1];
632         ptrdiff_t left_stride = -1;
633         for (int x = 0; x < width; x += 4) {
634             const int p0 = *topleft;
635             const int p1 = top[0], p2 = top[1], p3 = top[2], p4 = top[3];
636             const int p5 = left[0 * left_stride], p6 = left[1 * left_stride];
637             pixel *ptr = &dst[x];
638             const int8_t *flt_ptr = filter;
639 
640             for (int yy = 0; yy < 2; yy++) {
641                 for (int xx = 0; xx < 4; xx++, flt_ptr += FLT_INCR) {
642                     const int acc = FILTER(flt_ptr, p0, p1, p2, p3, p4, p5, p6);
643                     ptr[xx] = iclip_pixel((acc + 8) >> 4);
644                 }
645                 ptr += PXSTRIDE(stride);
646             }
647             left = &dst[x + 4 - 1];
648             left_stride = PXSTRIDE(stride);
649             top += 4;
650             topleft = &top[-1];
651         }
652         top = &dst[PXSTRIDE(stride)];
653         dst = &dst[PXSTRIDE(stride) * 2];
654     }
655 }
656 
657 static NOINLINE void
cfl_ac_c(int16_t * ac,const pixel * ypx,const ptrdiff_t stride,const int w_pad,const int h_pad,const int width,const int height,const int ss_hor,const int ss_ver)658 cfl_ac_c(int16_t *ac, const pixel *ypx, const ptrdiff_t stride,
659          const int w_pad, const int h_pad, const int width, const int height,
660          const int ss_hor, const int ss_ver)
661 {
662     int y, x;
663     int16_t *const ac_orig = ac;
664 
665     assert(w_pad >= 0 && w_pad * 4 < width);
666     assert(h_pad >= 0 && h_pad * 4 < height);
667 
668     for (y = 0; y < height - 4 * h_pad; y++) {
669         for (x = 0; x < width - 4 * w_pad; x++) {
670             int ac_sum = ypx[x << ss_hor];
671             if (ss_hor) ac_sum += ypx[x * 2 + 1];
672             if (ss_ver) {
673                 ac_sum += ypx[(x << ss_hor) + PXSTRIDE(stride)];
674                 if (ss_hor) ac_sum += ypx[x * 2 + 1 + PXSTRIDE(stride)];
675             }
676             ac[x] = ac_sum << (1 + !ss_ver + !ss_hor);
677         }
678         for (; x < width; x++)
679             ac[x] = ac[x - 1];
680         ac += width;
681         ypx += PXSTRIDE(stride) << ss_ver;
682     }
683     for (; y < height; y++) {
684         memcpy(ac, &ac[-width], width * sizeof(*ac));
685         ac += width;
686     }
687 
688     const int log2sz = ctz(width) + ctz(height);
689     int sum = (1 << log2sz) >> 1;
690     for (ac = ac_orig, y = 0; y < height; y++) {
691         for (x = 0; x < width; x++)
692             sum += ac[x];
693         ac += width;
694     }
695     sum >>= log2sz;
696 
697     // subtract DC
698     for (ac = ac_orig, y = 0; y < height; y++) {
699         for (x = 0; x < width; x++)
700             ac[x] -= sum;
701         ac += width;
702     }
703 }
704 
705 #define cfl_ac_fn(fmt, ss_hor, ss_ver) \
706 static void cfl_ac_##fmt##_c(int16_t *const ac, const pixel *const ypx, \
707                              const ptrdiff_t stride, const int w_pad, \
708                              const int h_pad, const int cw, const int ch) \
709 { \
710     cfl_ac_c(ac, ypx, stride, w_pad, h_pad, cw, ch, ss_hor, ss_ver); \
711 }
712 
713 cfl_ac_fn(420, 1, 1)
714 cfl_ac_fn(422, 1, 0)
715 cfl_ac_fn(444, 0, 0)
716 
/*
 * Palette prediction. Each byte of idx holds two palette indices: the low
 * bits (masked with 7) select the pixel at the even column and the high
 * nibble selects the pixel at the following odd column. Bits 3 and 7 of
 * every byte are asserted to be clear. idx is consumed sequentially across
 * rows, one byte per two output pixels.
 */
static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
                       const pixel *const pal, const uint8_t *idx,
                       const int w, const int h)
{
    for (int row = 0; row < h; row++, dst += PXSTRIDE(stride)) {
        for (int col = 0; col < w; col += 2) {
            const int packed = *idx++;
            assert(!(packed & 0x88));

            const int first = packed & 7;
            const int second = packed >> 4;

            dst[col] = pal[first];
            dst[col + 1] = pal[second];
        }
    }
}
731 
732 #if HAVE_ASM
733 #if ARCH_AARCH64 || ARCH_ARM
734 #include "src/arm/ipred.h"
735 #elif ARCH_RISCV
736 #include "src/riscv/ipred.h"
737 #elif ARCH_X86
738 #include "src/x86/ipred.h"
739 #elif ARCH_LOONGARCH64
740 #include "src/loongarch/ipred.h"
741 #endif
742 #endif
743 
bitfn(dav1d_intra_pred_dsp_init)744 COLD void bitfn(dav1d_intra_pred_dsp_init)(Dav1dIntraPredDSPContext *const c) {
745     c->intra_pred[DC_PRED      ] = ipred_dc_c;
746     c->intra_pred[DC_128_PRED  ] = ipred_dc_128_c;
747     c->intra_pred[TOP_DC_PRED  ] = ipred_dc_top_c;
748     c->intra_pred[LEFT_DC_PRED ] = ipred_dc_left_c;
749     c->intra_pred[HOR_PRED     ] = ipred_h_c;
750     c->intra_pred[VERT_PRED    ] = ipred_v_c;
751     c->intra_pred[PAETH_PRED   ] = ipred_paeth_c;
752     c->intra_pred[SMOOTH_PRED  ] = ipred_smooth_c;
753     c->intra_pred[SMOOTH_V_PRED] = ipred_smooth_v_c;
754     c->intra_pred[SMOOTH_H_PRED] = ipred_smooth_h_c;
755     c->intra_pred[Z1_PRED      ] = ipred_z1_c;
756     c->intra_pred[Z2_PRED      ] = ipred_z2_c;
757     c->intra_pred[Z3_PRED      ] = ipred_z3_c;
758     c->intra_pred[FILTER_PRED  ] = ipred_filter_c;
759 
760     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = cfl_ac_420_c;
761     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = cfl_ac_422_c;
762     c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = cfl_ac_444_c;
763 
764     c->cfl_pred[DC_PRED     ] = ipred_cfl_c;
765     c->cfl_pred[DC_128_PRED ] = ipred_cfl_128_c;
766     c->cfl_pred[TOP_DC_PRED ] = ipred_cfl_top_c;
767     c->cfl_pred[LEFT_DC_PRED] = ipred_cfl_left_c;
768 
769     c->pal_pred = pal_pred_c;
770 
771 #if HAVE_ASM
772 #if ARCH_AARCH64 || ARCH_ARM
773     intra_pred_dsp_init_arm(c);
774 #elif ARCH_RISCV
775     intra_pred_dsp_init_riscv(c);
776 #elif ARCH_X86
777     intra_pred_dsp_init_x86(c);
778 #elif ARCH_LOONGARCH64
779     intra_pred_dsp_init_loongarch(c);
780 #endif
781 #endif
782 }
783