• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright © 2018-2021, VideoLAN and dav1d authors
3  * Copyright © 2018, Two Orioles, LLC
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  *    list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  *    this list of conditions and the following disclaimer in the documentation
14  *    and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
20  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include "config.h"
29 
30 #include <string.h>
31 #include <stdio.h>
32 
33 #include "common/attributes.h"
34 #include "common/bitdepth.h"
35 #include "common/dump.h"
36 #include "common/frame.h"
37 #include "common/intops.h"
38 
39 #include "src/cdef_apply.h"
40 #include "src/ctx.h"
41 #include "src/ipred_prepare.h"
42 #include "src/lf_apply.h"
43 #include "src/lr_apply.h"
44 #include "src/recon.h"
45 #include "src/scan.h"
46 #include "src/tables.h"
47 #include "src/wedge.h"
48 
/*
 * Reads one Golomb-style value from the arithmetic decoder.
 *
 * A prefix of equiprobable bits is consumed until a set bit is seen; the
 * number of clear bits before it (capped at 32) is the length of the suffix.
 * The suffix bits are then appended, MSB first, below an implicit leading 1,
 * and the result is returned minus one.
 */
static inline unsigned read_golomb(MsacContext *const msac) {
    int num_bits = 0;

    /* The bit is always decoded before the length cap is tested, so the
     * terminating bit is consumed even when the cap has been reached. */
    for (;;) {
        if (dav1d_msac_decode_bool_equi(msac))
            break;
        if (num_bits >= 32)
            break;
        num_bits++;
    }

    unsigned acc = 1;
    for (int i = 0; i < num_bits; i++)
        acc = (acc << 1) + dav1d_msac_decode_bool_equi(msac);

    return acc - 1;
}
58 
get_skip_ctx(const TxfmInfo * const t_dim,const enum BlockSize bs,const uint8_t * const a,const uint8_t * const l,const int chroma,const enum Dav1dPixelLayout layout)59 static inline unsigned get_skip_ctx(const TxfmInfo *const t_dim,
60                                     const enum BlockSize bs,
61                                     const uint8_t *const a,
62                                     const uint8_t *const l,
63                                     const int chroma,
64                                     const enum Dav1dPixelLayout layout)
65 {
66     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
67 
68     if (chroma) {
69         const int ss_ver = layout == DAV1D_PIXEL_LAYOUT_I420;
70         const int ss_hor = layout != DAV1D_PIXEL_LAYOUT_I444;
71         const int not_one_blk = b_dim[2] - (!!b_dim[2] && ss_hor) > t_dim->lw ||
72                                 b_dim[3] - (!!b_dim[3] && ss_ver) > t_dim->lh;
73         unsigned ca, cl;
74 
75 #define MERGE_CTX(dir, type, no_val) \
76         c##dir = *(const type *) dir != no_val; \
77         break
78 
79         switch (t_dim->lw) {
80         /* For some reason the MSVC CRT _wassert() function is not flagged as
81          * __declspec(noreturn), so when using those headers the compiler will
82          * expect execution to continue after an assertion has been triggered
83          * and will therefore complain about the use of uninitialized variables
84          * when compiled in debug mode if we put the default case at the end. */
85         default: assert(0); /* fall-through */
86         case TX_4X4:   MERGE_CTX(a, uint8_t,  0x40);
87         case TX_8X8:   MERGE_CTX(a, uint16_t, 0x4040);
88         case TX_16X16: MERGE_CTX(a, uint32_t, 0x40404040U);
89         case TX_32X32: MERGE_CTX(a, uint64_t, 0x4040404040404040ULL);
90         }
91         switch (t_dim->lh) {
92         default: assert(0); /* fall-through */
93         case TX_4X4:   MERGE_CTX(l, uint8_t,  0x40);
94         case TX_8X8:   MERGE_CTX(l, uint16_t, 0x4040);
95         case TX_16X16: MERGE_CTX(l, uint32_t, 0x40404040U);
96         case TX_32X32: MERGE_CTX(l, uint64_t, 0x4040404040404040ULL);
97         }
98 #undef MERGE_CTX
99 
100         return 7 + not_one_blk * 3 + ca + cl;
101     } else if (b_dim[2] == t_dim->lw && b_dim[3] == t_dim->lh) {
102         return 0;
103     } else {
104         unsigned la, ll;
105 
106 #define MERGE_CTX(dir, type, tx) \
107         if (tx == TX_64X64) { \
108             uint64_t tmp = *(const uint64_t *) dir; \
109             tmp |= *(const uint64_t *) &dir[8]; \
110             l##dir = (unsigned) (tmp >> 32) | (unsigned) tmp; \
111         } else \
112             l##dir = *(const type *) dir; \
113         if (tx == TX_32X32) l##dir |= *(const type *) &dir[sizeof(type)]; \
114         if (tx >= TX_16X16) l##dir |= l##dir >> 16; \
115         if (tx >= TX_8X8)   l##dir |= l##dir >> 8; \
116         break
117 
118         switch (t_dim->lw) {
119         default: assert(0); /* fall-through */
120         case TX_4X4:   MERGE_CTX(a, uint8_t,  TX_4X4);
121         case TX_8X8:   MERGE_CTX(a, uint16_t, TX_8X8);
122         case TX_16X16: MERGE_CTX(a, uint32_t, TX_16X16);
123         case TX_32X32: MERGE_CTX(a, uint32_t, TX_32X32);
124         case TX_64X64: MERGE_CTX(a, uint32_t, TX_64X64);
125         }
126         switch (t_dim->lh) {
127         default: assert(0); /* fall-through */
128         case TX_4X4:   MERGE_CTX(l, uint8_t,  TX_4X4);
129         case TX_8X8:   MERGE_CTX(l, uint16_t, TX_8X8);
130         case TX_16X16: MERGE_CTX(l, uint32_t, TX_16X16);
131         case TX_32X32: MERGE_CTX(l, uint32_t, TX_32X32);
132         case TX_64X64: MERGE_CTX(l, uint32_t, TX_64X64);
133         }
134 #undef MERGE_CTX
135 
136         return dav1d_skip_ctx[umin(la & 0x3F, 4)][umin(ll & 0x3F, 4)];
137     }
138 }
139 
get_dc_sign_ctx(const int tx,const uint8_t * const a,const uint8_t * const l)140 static inline unsigned get_dc_sign_ctx(const int /*enum RectTxfmSize*/ tx,
141                                        const uint8_t *const a,
142                                        const uint8_t *const l)
143 {
144     uint64_t mask = 0xC0C0C0C0C0C0C0C0ULL, mul = 0x0101010101010101ULL;
145     int s;
146 
147 #if ARCH_X86_64 && defined(__GNUC__)
148     /* Coerce compilers into producing better code. For some reason
149      * every x86-64 compiler is awful at handling 64-bit constants. */
150     __asm__("" : "+r"(mask), "+r"(mul));
151 #endif
152 
153     switch(tx) {
154     default: assert(0); /* fall-through */
155     case TX_4X4: {
156         int t = *(const uint8_t *) a >> 6;
157         t    += *(const uint8_t *) l >> 6;
158         s = t - 1 - 1;
159         break;
160     }
161     case TX_8X8: {
162         uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
163         t         += *(const uint16_t *) l & (uint32_t) mask;
164         t *= 0x04040404U;
165         s = (int) (t >> 24) - 2 - 2;
166         break;
167     }
168     case TX_16X16: {
169         uint32_t t = (*(const uint32_t *) a & (uint32_t) mask) >> 6;
170         t         += (*(const uint32_t *) l & (uint32_t) mask) >> 6;
171         t *= (uint32_t) mul;
172         s = (int) (t >> 24) - 4 - 4;
173         break;
174     }
175     case TX_32X32: {
176         uint64_t t = (*(const uint64_t *) a & mask) >> 6;
177         t         += (*(const uint64_t *) l & mask) >> 6;
178         t *= mul;
179         s = (int) (t >> 56) - 8 - 8;
180         break;
181     }
182     case TX_64X64: {
183         uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
184         t         += (*(const uint64_t *) &a[8] & mask) >> 6;
185         t         += (*(const uint64_t *) &l[0] & mask) >> 6;
186         t         += (*(const uint64_t *) &l[8] & mask) >> 6;
187         t *= mul;
188         s = (int) (t >> 56) - 16 - 16;
189         break;
190     }
191     case RTX_4X8: {
192         uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
193         t         += *(const uint16_t *) l & (uint32_t) mask;
194         t *= 0x04040404U;
195         s = (int) (t >> 24) - 1 - 2;
196         break;
197     }
198     case RTX_8X4: {
199         uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
200         t         += *(const uint8_t  *) l & (uint32_t) mask;
201         t *= 0x04040404U;
202         s = (int) (t >> 24) - 2 - 1;
203         break;
204     }
205     case RTX_8X16: {
206         uint32_t t = *(const uint16_t *) a & (uint32_t) mask;
207         t         += *(const uint32_t *) l & (uint32_t) mask;
208         t = (t >> 6) * (uint32_t) mul;
209         s = (int) (t >> 24) - 2 - 4;
210         break;
211     }
212     case RTX_16X8: {
213         uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
214         t         += *(const uint16_t *) l & (uint32_t) mask;
215         t = (t >> 6) * (uint32_t) mul;
216         s = (int) (t >> 24) - 4 - 2;
217         break;
218     }
219     case RTX_16X32: {
220         uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
221         t         += *(const uint64_t *) l & mask;
222         t = (t >> 6) * mul;
223         s = (int) (t >> 56) - 4 - 8;
224         break;
225     }
226     case RTX_32X16: {
227         uint64_t t = *(const uint64_t *) a & mask;
228         t         += *(const uint32_t *) l & (uint32_t) mask;
229         t = (t >> 6) * mul;
230         s = (int) (t >> 56) - 8 - 4;
231         break;
232     }
233     case RTX_32X64: {
234         uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
235         t         += (*(const uint64_t *) &l[0] & mask) >> 6;
236         t         += (*(const uint64_t *) &l[8] & mask) >> 6;
237         t *= mul;
238         s = (int) (t >> 56) - 8 - 16;
239         break;
240     }
241     case RTX_64X32: {
242         uint64_t t = (*(const uint64_t *) &a[0] & mask) >> 6;
243         t         += (*(const uint64_t *) &a[8] & mask) >> 6;
244         t         += (*(const uint64_t *) &l[0] & mask) >> 6;
245         t *= mul;
246         s = (int) (t >> 56) - 16 - 8;
247         break;
248     }
249     case RTX_4X16: {
250         uint32_t t = *(const uint8_t  *) a & (uint32_t) mask;
251         t         += *(const uint32_t *) l & (uint32_t) mask;
252         t = (t >> 6) * (uint32_t) mul;
253         s = (int) (t >> 24) - 1 - 4;
254         break;
255     }
256     case RTX_16X4: {
257         uint32_t t = *(const uint32_t *) a & (uint32_t) mask;
258         t         += *(const uint8_t  *) l & (uint32_t) mask;
259         t = (t >> 6) * (uint32_t) mul;
260         s = (int) (t >> 24) - 4 - 1;
261         break;
262     }
263     case RTX_8X32: {
264         uint64_t t = *(const uint16_t *) a & (uint32_t) mask;
265         t         += *(const uint64_t *) l & mask;
266         t = (t >> 6) * mul;
267         s = (int) (t >> 56) - 2 - 8;
268         break;
269     }
270     case RTX_32X8: {
271         uint64_t t = *(const uint64_t *) a & mask;
272         t         += *(const uint16_t *) l & (uint32_t) mask;
273         t = (t >> 6) * mul;
274         s = (int) (t >> 56) - 8 - 2;
275         break;
276     }
277     case RTX_16X64: {
278         uint64_t t = *(const uint32_t *) a & (uint32_t) mask;
279         t         += *(const uint64_t *) &l[0] & mask;
280         t = (t >> 6) + ((*(const uint64_t *) &l[8] & mask) >> 6);
281         t *= mul;
282         s = (int) (t >> 56) - 4 - 16;
283         break;
284     }
285     case RTX_64X16: {
286         uint64_t t = *(const uint64_t *) &a[0] & mask;
287         t         += *(const uint32_t *) l & (uint32_t) mask;
288         t = (t >> 6) + ((*(const uint64_t *) &a[8] & mask) >> 6);
289         t *= mul;
290         s = (int) (t >> 56) - 16 - 4;
291         break;
292     }
293     }
294 
295     return (s != 0) + (s > 0);
296 }
297 
get_lo_ctx(const uint8_t * const levels,const enum TxClass tx_class,unsigned * const hi_mag,const uint8_t (* const ctx_offsets)[5],const unsigned x,const unsigned y,const ptrdiff_t stride)298 static inline unsigned get_lo_ctx(const uint8_t *const levels,
299                                   const enum TxClass tx_class,
300                                   unsigned *const hi_mag,
301                                   const uint8_t (*const ctx_offsets)[5],
302                                   const unsigned x, const unsigned y,
303                                   const ptrdiff_t stride)
304 {
305     unsigned mag = levels[0 * stride + 1] + levels[1 * stride + 0];
306     unsigned offset;
307     if (tx_class == TX_CLASS_2D) {
308         mag += levels[1 * stride + 1];
309         *hi_mag = mag;
310         mag += levels[0 * stride + 2] + levels[2 * stride + 0];
311         offset = ctx_offsets[umin(y, 4)][umin(x, 4)];
312     } else {
313         mag += levels[0 * stride + 2];
314         *hi_mag = mag;
315         mag += levels[0 * stride + 3] + levels[0 * stride + 4];
316         offset = 26 + (y > 1 ? 10 : y * 5);
317     }
318     return offset + (mag > 512 ? 4 : (mag + 64) >> 7);
319 }
320 
/*
 * Parses the entropy-coded coefficients of one transform block and writes
 * the dequantized values to cf.
 *
 * Steps, in the order the symbols are read:
 *   1. the all-zero flag, using the context from get_skip_ctx();
 *   2. the transform type: forced (lossless, t_dim->max + intra >= TX_64X64,
 *      or segment qidx == 0), derived (chroma), or explicitly coded (luma);
 *   3. the end-of-block position (eob);
 *   4. the tokens, from position eob down to the DC coefficient. While this
 *      runs, cf[rc] holds (tok << 11) | next_rc, chaining the non-zero
 *      positions together (a next_rc of zero ends the chain);
 *   5. the DC sign, Golomb residuals for tokens equal to 15, the AC signs
 *      and the dequantization, walking the chain built in step 4.
 *
 * t        task context; supplies the tile state (msac, CDFs, dq tables),
 *          the frame context and the scratch levels buffer
 * a, l     context byte arrays forwarded to get_skip_ctx() and
 *          get_dc_sign_ctx()
 * tx, bs   transform size and block size
 * b        block info; seg_id, uv_mode, y_mode and y_angle are read
 * intra    non-zero selects the intra transform-type paths
 * plane    zero selects the luma paths, non-zero the chroma paths
 * cf       coefficient buffer written by this function
 * txtp     out: the selected transform type. It is also read on entry for
 *          inter chroma blocks, where it is passed to get_uv_inter_txtp()
 * res_ctx  out: 0x40 for an all-zero block, otherwise
 *          umin(cul_level, 63) | dc_sign_level
 *
 * Returns -1 when the block is all-zero, otherwise eob.
 */
decode_coefs(Dav1dTaskContext * const t,uint8_t * const a,uint8_t * const l,const enum RectTxfmSize tx,const enum BlockSize bs,const Av1Block * const b,const int intra,const int plane,coef * cf,enum TxfmType * const txtp,uint8_t * res_ctx)321 static int decode_coefs(Dav1dTaskContext *const t,
322                         uint8_t *const a, uint8_t *const l,
323                         const enum RectTxfmSize tx, const enum BlockSize bs,
324                         const Av1Block *const b, const int intra,
325                         const int plane, coef *cf,
326                         enum TxfmType *const txtp, uint8_t *res_ctx)
327 {
328     Dav1dTileState *const ts = t->ts;
329     const int chroma = !!plane;
330     const Dav1dFrameContext *const f = t->f;
331     const int lossless = f->frame_hdr->segmentation.lossless[b->seg_id];
332     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[tx];
        // the trailing "&& 0" makes dbg always zero, so every debug printf
        // below stays compiled but disabled until that term is edited out
333     const int dbg = DEBUG_BLOCK_INFO && plane && 0;
334 
335     if (dbg)
336         printf("Start: r=%d\n", ts->msac.rng);
337 
338     // does this block have any non-zero coefficients
339     const int sctx = get_skip_ctx(t_dim, bs, a, l, chroma, f->cur.p.layout);
340     const int all_skip = dav1d_msac_decode_bool_adapt(&ts->msac,
341                              ts->cdf.coef.skip[t_dim->ctx][sctx]);
342     if (dbg)
343         printf("Post-non-zero[%d][%d][%d]: r=%d\n",
344                t_dim->ctx, sctx, all_skip, ts->msac.rng);
345     if (all_skip) {
346         *res_ctx = 0x40;
347         *txtp = lossless * WHT_WHT; /* lossless ? WHT_WHT : DCT_DCT */
348         return -1;
349     }
350 
351     // transform type (chroma: derived, luma: explicitly coded)
352     if (lossless) {
353         assert(t_dim->max == TX_4X4);
354         *txtp = WHT_WHT;
355     } else if (t_dim->max + intra >= TX_64X64) {
356         *txtp = DCT_DCT;
357     } else if (chroma) {
358         // inferred from either the luma txtp (inter) or a LUT (intra)
359         *txtp = intra ? dav1d_txtp_from_uvmode[b->uv_mode] :
360                         get_uv_inter_txtp(t_dim, *txtp);
361     } else if (!f->frame_hdr->segmentation.qidx[b->seg_id]) {
362         // In libaom, lossless is checked by a literal qidx == 0, but not all
363         // such blocks are actually lossless. The remainder gets an implicit
364         // transform type (for luma)
365         *txtp = DCT_DCT;
366     } else {
367         unsigned idx;
368         if (intra) {
369             const enum IntraPredMode y_mode_nofilt = b->y_mode == FILTER_PRED ?
370                 dav1d_filter_mode_to_y_mode[b->y_angle] : b->y_mode;
371             if (f->frame_hdr->reduced_txtp_set || t_dim->min == TX_16X16) {
372                 idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
373                           ts->cdf.m.txtp_intra2[t_dim->min][y_mode_nofilt], 4);
374                 *txtp = dav1d_tx_types_per_set[idx + 0];
375             } else {
376                 idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
377                           ts->cdf.m.txtp_intra1[t_dim->min][y_mode_nofilt], 6);
378                 *txtp = dav1d_tx_types_per_set[idx + 5];
379             }
380             if (dbg)
381                 printf("Post-txtp-intra[%d->%d][%d][%d->%d]: r=%d\n",
382                        tx, t_dim->min, y_mode_nofilt, idx, *txtp, ts->msac.rng);
383         } else {
384             if (f->frame_hdr->reduced_txtp_set || t_dim->max == TX_32X32) {
385                 idx = dav1d_msac_decode_bool_adapt(&ts->msac,
386                           ts->cdf.m.txtp_inter3[t_dim->min]);
387                 *txtp = (idx - 1) & IDTX; /* idx ? DCT_DCT : IDTX */
388             } else if (t_dim->min == TX_16X16) {
389                 idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
390                           ts->cdf.m.txtp_inter2, 11);
391                 *txtp = dav1d_tx_types_per_set[idx + 12];
392             } else {
393                 idx = dav1d_msac_decode_symbol_adapt16(&ts->msac,
394                           ts->cdf.m.txtp_inter1[t_dim->min], 15);
395                 *txtp = dav1d_tx_types_per_set[idx + 24];
396             }
397             if (dbg)
398                 printf("Post-txtp-inter[%d->%d][%d->%d]: r=%d\n",
399                        tx, t_dim->min, idx, *txtp, ts->msac.rng);
400         }
401     }
402 
403     // find end-of-block (eob)
404     int eob_bin;
405     const int tx2dszctx = imin(t_dim->lw, TX_32X32) + imin(t_dim->lh, TX_32X32);
406     const enum TxClass tx_class = dav1d_tx_type_class[*txtp];
407     const int is_1d = tx_class != TX_CLASS_2D;
408     switch (tx2dszctx) {
409 #define case_sz(sz, bin, ns, is_1d) \
410     case sz: { \
411         uint16_t *const eob_bin_cdf = ts->cdf.coef.eob_bin_##bin[chroma]is_1d; \
412         eob_bin = dav1d_msac_decode_symbol_adapt##ns(&ts->msac, eob_bin_cdf, 4 + sz); \
413         break; \
414     }
415     case_sz(0,   16,  8, [is_1d]);
416     case_sz(1,   32,  8, [is_1d]);
417     case_sz(2,   64,  8, [is_1d]);
418     case_sz(3,  128,  8, [is_1d]);
419     case_sz(4,  256, 16, [is_1d]);
420     case_sz(5,  512, 16,        );
421     case_sz(6, 1024, 16,        );
422 #undef case_sz
423     }
424     if (dbg)
425         printf("Post-eob_bin_%d[%d][%d][%d]: r=%d\n",
426                16 << tx2dszctx, chroma, is_1d, eob_bin, ts->msac.rng);
427     int eob;
        // eob_bin values 0 and 1 are the eob itself. For larger bins the eob
        // is built from an implicit leading 1 (the "| 2"), one adaptively
        // coded bit right below it, and the eob_bin - 2 low bits returned by
        // dav1d_msac_decode_bools().
428     if (eob_bin > 1) {
429         uint16_t *const eob_hi_bit_cdf =
430             ts->cdf.coef.eob_hi_bit[t_dim->ctx][chroma][eob_bin];
431         const int eob_hi_bit = dav1d_msac_decode_bool_adapt(&ts->msac, eob_hi_bit_cdf);
432         if (dbg)
433             printf("Post-eob_hi_bit[%d][%d][%d][%d]: r=%d\n",
434                    t_dim->ctx, chroma, eob_bin, eob_hi_bit, ts->msac.rng);
435         eob = ((eob_hi_bit | 2) << (eob_bin - 2)) |
436               dav1d_msac_decode_bools(&ts->msac, eob_bin - 2);
437         if (dbg)
438             printf("Post-eob[%d]: r=%d\n", eob, ts->msac.rng);
439     } else {
440         eob = eob_bin;
441     }
442     assert(eob >= 0);
443 
444     // base tokens
445     uint16_t (*const eob_cdf)[4] = ts->cdf.coef.eob_base_tok[t_dim->ctx][chroma];
446     uint16_t (*const hi_cdf)[4] = ts->cdf.coef.br_tok[imin(t_dim->ctx, 3)][chroma];
447     unsigned rc, dc_tok;
448 
449     if (eob) {
450         uint16_t (*const lo_cdf)[4] = ts->cdf.coef.base_tok[t_dim->ctx][chroma];
451         uint8_t *const levels = t->scratch.levels; // bits 0-5: tok, 6-7: lo_tok
452         const int sw = imin(t_dim->w, 8), sh = imin(t_dim->h, 8);
453 
454         /* eob */
455         unsigned ctx = 1 + (eob > sw * sh * 2) + (eob > sw * sh * 4);
456         int eob_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[ctx], 2);
457         int tok = eob_tok + 1;
            // 0x41 == (1 << 6) | 1: places tok in both fields of the levels
            // byte described above (bits 0-5 and bits 6-7)
458         int level_tok = tok * 0x41;
459         unsigned mag;
460 
461 #define DECODE_COEFS_CLASS(tx_class) \
462         unsigned x, y; \
463         if (tx_class == TX_CLASS_2D) \
464             rc = scan[eob], x = rc >> shift, y = rc & mask; \
465         else if (tx_class == TX_CLASS_H) \
466             /* Transposing reduces the stride and padding requirements */ \
467             x = eob & mask, y = eob >> shift, rc = eob; \
468         else /* tx_class == TX_CLASS_V */ \
469             x = eob & mask, y = eob >> shift, rc = (x << shift2) | y; \
470         if (dbg) \
471             printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
472                    t_dim->ctx, chroma, ctx, eob, rc, tok, ts->msac.rng); \
473         if (eob_tok == 2) { \
474             ctx = (tx_class == TX_CLASS_2D ? (x | y) > 1 : y != 0) ? 14 : 7; \
475             tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
476             level_tok = tok + (3 << 6); \
477             if (dbg) \
478                 printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
479                        imin(t_dim->ctx, 3), chroma, ctx, eob, rc, tok, \
480                        ts->msac.rng); \
481         } \
482         cf[rc] = tok << 11; \
483         levels[x * stride + y] = (uint8_t) level_tok; \
484         for (int i = eob - 1; i > 0; i--) { /* ac */ \
485             unsigned rc_i; \
486             if (tx_class == TX_CLASS_2D) \
487                 rc_i = scan[i], x = rc_i >> shift, y = rc_i & mask; \
488             else if (tx_class == TX_CLASS_H) \
489                 x = i & mask, y = i >> shift, rc_i = i; \
490             else /* tx_class == TX_CLASS_V */ \
491                 x = i & mask, y = i >> shift, rc_i = (x << shift2) | y; \
492             assert(x < 32 && y < 32); \
493             uint8_t *const level = levels + x * stride + y; \
494             ctx = get_lo_ctx(level, tx_class, &mag, lo_ctx_offsets, x, y, stride); \
495             if (tx_class == TX_CLASS_2D) \
496                 y |= x; \
497             tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
498             if (dbg) \
499                 printf("Post-lo_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
500                        t_dim->ctx, chroma, ctx, i, rc_i, tok, ts->msac.rng); \
501             if (tok == 3) { \
502                 mag &= 63; \
503                 ctx = (y > (tx_class == TX_CLASS_2D) ? 14 : 7) + \
504                       (mag > 12 ? 6 : (mag + 1) >> 1); \
505                 tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
506                 if (dbg) \
507                     printf("Post-hi_tok[%d][%d][%d][%d=%d=%d]: r=%d\n", \
508                            imin(t_dim->ctx, 3), chroma, ctx, i, rc_i, tok, \
509                            ts->msac.rng); \
510                 *level = (uint8_t) (tok + (3 << 6)); \
511                 cf[rc_i] = (tok << 11) | rc; \
512                 rc = rc_i; \
513             } else { \
514                 /* 0x1 for tok, 0x7ff as bitmask for rc, 0x41 for level_tok */ \
515                 tok *= 0x17ff41; \
516                 *level = (uint8_t) tok; \
517                 /* tok ? (tok << 11) | rc : 0 */ \
518                 tok = (tok >> 9) & (rc + ~0x7ffu); \
519                 if (tok) rc = rc_i; \
520                 cf[rc_i] = tok; \
521             } \
522         } \
523         /* dc */ \
524         ctx = (tx_class == TX_CLASS_2D) ? 0 : \
525             get_lo_ctx(levels, tx_class, &mag, lo_ctx_offsets, 0, 0, stride); \
526         dc_tok = dav1d_msac_decode_symbol_adapt4(&ts->msac, lo_cdf[ctx], 3); \
527         if (dbg) \
528             printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n", \
529                    t_dim->ctx, chroma, ctx, dc_tok, ts->msac.rng); \
530         if (dc_tok == 3) { \
531             if (tx_class == TX_CLASS_2D) \
532                 mag = levels[0 * stride + 1] + levels[1 * stride + 0] + \
533                       levels[1 * stride + 1]; \
534             mag &= 63; \
535             ctx = mag > 12 ? 6 : (mag + 1) >> 1; \
536             dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[ctx]); \
537             if (dbg) \
538                 printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n", \
539                        imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng); \
540         } \
541         break
542 
            // Each case below defines the locals the macro expects
            // (lo_ctx_offsets, stride, shift, shift2, mask) and then expands
            // it with a constant class argument. The macro body ends in
            // "break", so the cases do not fall through into each other.
543         const uint16_t *scan;
544         switch (tx_class) {
545         case TX_CLASS_2D: {
546             const unsigned nonsquare_tx = tx >= RTX_4X8;
547             const uint8_t (*const lo_ctx_offsets)[5] =
548                 dav1d_lo_ctx_offsets[nonsquare_tx + (tx & nonsquare_tx)];
549             scan = dav1d_scans[tx];
550             const ptrdiff_t stride = 4 * sh;
551             const unsigned shift = t_dim->lh < 4 ? t_dim->lh + 2 : 5, shift2 = 0;
552             const unsigned mask = 4 * sh - 1;
553             memset(levels, 0, stride * (4 * sw + 2));
554             DECODE_COEFS_CLASS(TX_CLASS_2D);
555         }
556         case TX_CLASS_H: {
557             const uint8_t (*const lo_ctx_offsets)[5] = NULL;
558             const ptrdiff_t stride = 16;
559             const unsigned shift = t_dim->lh + 2, shift2 = 0;
560             const unsigned mask = 4 * sh - 1;
561             memset(levels, 0, stride * (4 * sh + 2));
562             DECODE_COEFS_CLASS(TX_CLASS_H);
563         }
564         case TX_CLASS_V: {
565             const uint8_t (*const lo_ctx_offsets)[5] = NULL;
566             const ptrdiff_t stride = 16;
567             const unsigned shift = t_dim->lw + 2, shift2 = t_dim->lh + 2;
568             const unsigned mask = 4 * sw - 1;
569             memset(levels, 0, stride * (4 * sw + 2));
570             DECODE_COEFS_CLASS(TX_CLASS_V);
571         }
572 #undef DECODE_COEFS_CLASS
573         default: assert(0);
574         }
575     } else { // dc-only
576         int tok_br = dav1d_msac_decode_symbol_adapt4(&ts->msac, eob_cdf[0], 2);
577         dc_tok = 1 + tok_br;
578         if (dbg)
579             printf("Post-dc_lo_tok[%d][%d][%d][%d]: r=%d\n",
580                    t_dim->ctx, chroma, 0, dc_tok, ts->msac.rng);
581         if (tok_br == 2) {
582             dc_tok = dav1d_msac_decode_hi_tok(&ts->msac, hi_cdf[0]);
583             if (dbg)
584                 printf("Post-dc_hi_tok[%d][%d][0][%d]: r=%d\n",
585                        imin(t_dim->ctx, 3), chroma, dc_tok, ts->msac.rng);
586         }
587         rc = 0;
588     }
589 
590     // residual and sign
591     const uint16_t *const dq_tbl = ts->dq[b->seg_id][plane];
592     const uint8_t *const qm_tbl = *txtp < IDTX ? f->qm[tx][plane] : NULL;
593     const int dq_shift = imax(0, t_dim->ctx - 2);
594     const int cf_max = ~(~127U << (BITDEPTH == 8 ? 8 : f->cur.p.bpc));
595     unsigned cul_level, dc_sign_level;
596 
        // A zero DC token means no DC sign is coded: the DC handling is
        // skipped and control jumps straight into the matching AC loop. The
        // labels sit after the "if (rc)" tests, so that test is bypassed
        // (presumably rc is non-zero whenever this path is taken - confirm).
597     if (!dc_tok) {
598         cul_level = 0;
599         dc_sign_level = 1 << 6;
600         if (qm_tbl) goto ac_qm;
601         goto ac_noqm;
602     }
603 
604     const int dc_sign_ctx = get_dc_sign_ctx(tx, a, l);
605     uint16_t *const dc_sign_cdf = ts->cdf.coef.dc_sign[chroma][dc_sign_ctx];
606     const int dc_sign = dav1d_msac_decode_bool_adapt(&ts->msac, dc_sign_cdf);
607     if (dbg)
608         printf("Post-dc_sign[%d][%d][%d]: r=%d\n",
609                chroma, dc_sign_ctx, dc_sign, ts->msac.rng);
610 
611     int dc_dq = dq_tbl[0];
        // dc_sign == 0 gives 2 << 6 (0x80), dc_sign == 1 gives 0
612     dc_sign_level = (dc_sign - 1) & (2 << 6);
613 
614     if (qm_tbl) {
615         dc_dq = (dc_dq * qm_tbl[0] + 16) >> 5;
616 
617         if (dc_tok == 15) {
618             dc_tok = read_golomb(&ts->msac) + 15;
619             if (dbg)
620                 printf("Post-dc_residual[%d->%d]: r=%d\n",
621                        dc_tok - 15, dc_tok, ts->msac.rng);
622 
623             dc_tok &= 0xfffff;
624             dc_dq = (dc_dq * dc_tok) & 0xffffff;
625         } else {
626             dc_dq *= dc_tok;
627             assert(dc_dq <= 0xffffff);
628         }
629         cul_level = dc_tok;
630         dc_dq >>= dq_shift;
631         dc_dq = umin(dc_dq, cf_max + dc_sign);
632         cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
633 
            // walk the chain of non-zero AC positions stored in cf[]; the
            // label is the entry point for the zero-DC path above
634         if (rc) ac_qm: {
635             const unsigned ac_dq = dq_tbl[1];
636             do {
637                 const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
638                 if (dbg)
639                     printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
640                 const unsigned rc_tok = cf[rc];
641                 unsigned tok, dq = (ac_dq * qm_tbl[rc] + 16) >> 5;
642                 int dq_sat;
643 
644                 if (rc_tok >= (15 << 11)) {
645                     tok = read_golomb(&ts->msac) + 15;
646                     if (dbg)
647                         printf("Post-residual[%d=%d->%d]: r=%d\n",
648                                rc, tok - 15, tok, ts->msac.rng);
649 
650                     tok &= 0xfffff;
651                     dq = (dq * tok) & 0xffffff;
652                 } else {
653                     tok = rc_tok >> 11;
654                     dq *= tok;
655                     assert(dq <= 0xffffff);
656                 }
657                 cul_level += tok;
658                 dq >>= dq_shift;
659                 dq_sat = umin(dq, cf_max + sign);
660                 cf[rc] = (coef) (sign ? -dq_sat : dq_sat);
661 
662                 rc = rc_tok & 0x3ff;
663             } while (rc);
664         }
665     } else {
666         // non-qmatrix is the common case and allows for additional optimizations
667         if (dc_tok == 15) {
668             dc_tok = read_golomb(&ts->msac) + 15;
669             if (dbg)
670                 printf("Post-dc_residual[%d->%d]: r=%d\n",
671                        dc_tok - 15, dc_tok, ts->msac.rng);
672 
673             dc_tok &= 0xfffff;
674             dc_dq = ((dc_dq * dc_tok) & 0xffffff) >> dq_shift;
675             dc_dq = umin(dc_dq, cf_max + dc_sign);
676         } else {
677             dc_dq = ((dc_dq * dc_tok) >> dq_shift);
678             assert(dc_dq <= cf_max);
679         }
680         cul_level = dc_tok;
681         cf[0] = (coef) (dc_sign ? -dc_dq : dc_dq);
682 
683         if (rc) ac_noqm: {
684             const unsigned ac_dq = dq_tbl[1];
685             do {
686                 const int sign = dav1d_msac_decode_bool_equi(&ts->msac);
687                 if (dbg)
688                     printf("Post-sign[%d=%d]: r=%d\n", rc, sign, ts->msac.rng);
689                 const unsigned rc_tok = cf[rc];
690                 unsigned tok;
691                 int dq;
692 
693                 // residual
694                 if (rc_tok >= (15 << 11)) {
695                     tok = read_golomb(&ts->msac) + 15;
696                     if (dbg)
697                         printf("Post-residual[%d=%d->%d]: r=%d\n",
698                                rc, tok - 15, tok, ts->msac.rng);
699 
700                     // coefficient parsing, see 5.11.39
701                     tok &= 0xfffff;
702 
703                     // dequant, see 7.12.3
704                     dq = ((ac_dq * tok) & 0xffffff) >> dq_shift;
705                     dq = umin(dq, cf_max + sign);
706                 } else {
707                     // cannot exceed cf_max, so we can avoid the clipping
708                     tok = rc_tok >> 11;
709                     dq = ((ac_dq * tok) >> dq_shift);
710                     assert(dq <= cf_max);
711                 }
712                 cul_level += tok;
713                 cf[rc] = (coef) (sign ? -dq : dq);
714 
715                 rc = rc_tok & 0x3ff; // next non-zero rc, zero if eob
716             } while (rc);
717         }
718     }
719 
720     // context
721     *res_ctx = umin(cul_level, 63) | dc_sign_level;
722 
723     return eob;
724 }
725 
/*
 * Recursively walks the luma transform partition of a block and processes
 * every leaf transform block.
 *
 * While depth < 2 and bit (y_off * 4 + x_off) of tx_split[depth] is set, the
 * transform ytx is divided into children of size t_dim->sub. t->bx / t->by are
 * moved to each child for the duration of the recursive call and restored
 * afterwards; children that start at or beyond f->bw / f->bh are not visited.
 *
 * At a leaf, coefficients are decoded with decode_coefs() when the frame-thread
 * pass is not 2; otherwise the packed eob/txtp value is read back from
 * ts->frame_thread[0].cbi. Unless the pass is odd, the inverse transform is
 * then added into dst (which must be non-NULL in that case).
 */
static void read_coef_tree(Dav1dTaskContext *const t,
                           const enum BlockSize bs, const Av1Block *const b,
                           const enum RectTxfmSize ytx, const int depth,
                           const uint16_t *const tx_split,
                           const int x_off, const int y_off, pixel *dst)
{
    const Dav1dFrameContext *const f = t->f;
    Dav1dTileState *const ts = t->ts;
    const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[ytx];
    const int txw = t_dim->w;
    const int txh = t_dim->h;

    /* y_off can be larger than 3 since lossless blocks use TX_4X4 but can't
     * be split. The leading tests keep the (then undefined) left shift from
     * being evaluated. */
    if (depth < 2 && tx_split[depth] &&
        tx_split[depth] & (1 << (y_off * 4 + x_off)))
    {
        const enum RectTxfmSize sub = t_dim->sub;
        const TxfmInfo *const sub_dim = &dav1d_txfm_dimensions[sub];
        const int sub_w = sub_dim->w;
        const int sub_h = sub_dim->h;
        const int child_depth = depth + 1;
        const int child_x = x_off * 2;
        const int child_y = y_off * 2;
        const int has_right = txw >= txh;  // a second column of children exists
        const int has_below = txh >= txw;  // a second row of children exists

        // first child (same position as the parent)
        read_coef_tree(t, bs, b, sub, child_depth, tx_split,
                       child_x, child_y, dst);

        // child to the right of the first one
        t->bx += sub_w;
        if (has_right && t->bx < f->bw)
            read_coef_tree(t, bs, b, sub, child_depth, tx_split,
                           child_x + 1, child_y,
                           dst ? &dst[4 * sub_w] : NULL);
        t->bx -= sub_w;

        // second row of children
        t->by += sub_h;
        if (has_below && t->by < f->bh) {
            if (dst)
                dst += 4 * sub_h * PXSTRIDE(f->cur.stride[0]);
            read_coef_tree(t, bs, b, sub, child_depth, tx_split,
                           child_x, child_y + 1, dst);
            t->bx += sub_w;
            if (has_right && t->bx < f->bw)
                read_coef_tree(t, bs, b, sub, child_depth, tx_split,
                               child_x + 1, child_y + 1,
                               dst ? &dst[4 * sub_w] : NULL);
            t->bx -= sub_w;
        }
        t->by -= sub_h;
        return;
    }

    /* Leaf transform block. */
    const int bx4 = t->bx & 31;
    const int by4 = t->by & 31;
    enum TxfmType tx_type;
    int eob;
    coef *coefs;

    // Select the coefficient buffer: the per-task buffer without frame
    // threading, otherwise the tile's buffer for this pass (and advance it).
    if (!t->frame_thread.pass) {
        coefs = bitfn(t->cf);
    } else {
        const int idx = t->frame_thread.pass & 1;
        assert(ts->frame_thread[idx].cf);
        coefs = ts->frame_thread[idx].cf;
        ts->frame_thread[idx].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
    }

    if (t->frame_thread.pass == 2) {
        // eob and transform type were packed as eob * 32 + txtp in pass 1
        const int packed = *ts->frame_thread[0].cbi++;
        eob     = packed >> 5;
        tx_type = packed & 0x1f;
    } else {
        uint8_t ctx_val;
        eob = decode_coefs(t, &t->a->lcoef[bx4], &t->l.lcoef[by4],
                           ytx, bs, b, 0, 0, coefs, &tx_type, &ctx_val);
        if (DEBUG_BLOCK_INFO)
            printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                   ytx, tx_type, eob, ts->msac.rng);

        // store the returned coefficient context in the left/above arrays,
        // clipped to the frame dimensions
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
        rep_macro(type, t->dir lcoef, off, mul * ctx_val)
#define default_memset(dir, diridx, off, sz) \
        memset(&t->dir lcoef[off], ctx_val, sz)
        case_set_upto16_with_default(imin(txh, f->bh - t->by), l., 1, by4);
        case_set_upto16_with_default(imin(txw, f->bw - t->bx), a->, 0, bx4);
#undef default_memset
#undef set_ctx

        // record the transform type for every 4x4 unit covered by this
        // transform in the 32-entry-wide scratch map
        uint8_t *map_row = &t->scratch.txtp_map[by4 * 32 + bx4];
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
        for (int row = 0; row < txh; row++) { \
            rep_macro(type, map_row, 0, mul * tx_type); \
            map_row += 32; \
        }
        case_set_upto16(txw,,,);
#undef set_ctx

        if (t->frame_thread.pass == 1)
            *ts->frame_thread[1].cbi++ = eob * (1 << 5) + tx_type;
    }

    // odd passes only parse; nothing to reconstruct
    if (t->frame_thread.pass & 1)
        return;

    assert(dst);
    if (eob < 0)
        return;

    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
        coef_dump(coefs, imin(t_dim->h, 8) * 4, imin(t_dim->w, 8) * 4, 3, "dq");
    f->dsp->itx.itxfm_add[ytx][tx_type](dst, f->cur.stride[0], coefs, eob
                                        HIGHBD_CALL_SUFFIX);
    if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
        hex_dump(dst, f->cur.stride[0], t_dim->w * 4, t_dim->h * 4, "recon");
}
824 
bytefn(dav1d_read_coef_blocks)825 void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
826                                     const enum BlockSize bs, const Av1Block *const b)
827 {
828     const Dav1dFrameContext *const f = t->f;
829     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
830     const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
831     const int bx4 = t->bx & 31, by4 = t->by & 31;
832     const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
833     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
834     const int bw4 = b_dim[0], bh4 = b_dim[1];
835     const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
836     const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
837                            (bw4 > ss_hor || t->bx & 1) &&
838                            (bh4 > ss_ver || t->by & 1);
839 
840     if (b->skip) {
841 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
842         rep_macro(type, t->dir lcoef, off, mul * 0x40)
843         case_set(bh4, l., 1, by4);
844         case_set(bw4, a->, 0, bx4);
845 #undef set_ctx
846         if (has_chroma) {
847 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
848             rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
849             rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
850             case_set(cbh4, l., 1, cby4);
851             case_set(cbw4, a->, 0, cbx4);
852 #undef set_ctx
853         }
854         return;
855     }
856 
857     Dav1dTileState *const ts = t->ts;
858     const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
859     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
860     assert(t->frame_thread.pass == 1);
861     assert(!b->skip);
862     const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
863     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->intra ? b->tx : b->max_ytx];
864     const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
865 
866     for (int init_y = 0; init_y < h4; init_y += 16) {
867         const int sub_h4 = imin(h4, 16 + init_y);
868         for (int init_x = 0; init_x < w4; init_x += 16) {
869             const int sub_w4 = imin(w4, init_x + 16);
870             int y_off = !!init_y, y, x;
871             for (y = init_y, t->by += init_y; y < sub_h4;
872                  y += t_dim->h, t->by += t_dim->h, y_off++)
873             {
874                 int x_off = !!init_x;
875                 for (x = init_x, t->bx += init_x; x < sub_w4;
876                      x += t_dim->w, t->bx += t_dim->w, x_off++)
877                 {
878                     if (!b->intra) {
879                         read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
880                                        x_off, y_off, NULL);
881                     } else {
882                         uint8_t cf_ctx = 0x40;
883                         enum TxfmType txtp;
884                         const int eob =
885                             decode_coefs(t, &t->a->lcoef[bx4 + x],
886                                          &t->l.lcoef[by4 + y], b->tx, bs, b, 1,
887                                          0, ts->frame_thread[1].cf, &txtp, &cf_ctx);
888                         if (DEBUG_BLOCK_INFO)
889                             printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
890                                    b->tx, txtp, eob, ts->msac.rng);
891                         *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
892                         ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
893 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
894                         rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
895 #define default_memset(dir, diridx, off, sz) \
896                         memset(&t->dir lcoef[off], cf_ctx, sz)
897                         case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by),
898                                                      l., 1, by4 + y);
899                         case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx),
900                                                      a->, 0, bx4 + x);
901 #undef default_memset
902 #undef set_ctx
903                     }
904                 }
905                 t->bx -= x;
906             }
907             t->by -= y;
908 
909             if (!has_chroma) continue;
910 
911             const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
912             const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
913             for (int pl = 0; pl < 2; pl++) {
914                 for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
915                      y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
916                 {
917                     for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
918                          x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
919                     {
920                         uint8_t cf_ctx = 0x40;
921                         enum TxfmType txtp;
922                         if (!b->intra)
923                             txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
924                                                         bx4 + (x << ss_hor)];
925                         const int eob =
926                             decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
927                                          &t->l.ccoef[pl][cby4 + y], b->uvtx, bs,
928                                          b, b->intra, 1 + pl, ts->frame_thread[1].cf,
929                                          &txtp, &cf_ctx);
930                         if (DEBUG_BLOCK_INFO)
931                             printf("Post-uv-cf-blk[pl=%d,tx=%d,"
932                                    "txtp=%d,eob=%d]: r=%d\n",
933                                    pl, b->uvtx, txtp, eob, ts->msac.rng);
934                         *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
935                         ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
936 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
937                         rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
938 #define default_memset(dir, diridx, off, sz) \
939                         memset(&t->dir ccoef[pl][off], cf_ctx, sz)
940                         case_set_upto16_with_default( \
941                                  imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
942                                  l., 1, cby4 + y);
943                         case_set_upto16_with_default( \
944                                  imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
945                                  a->, 0, cbx4 + x);
946 #undef default_memset
947 #undef set_ctx
948                     }
949                     t->bx -= x << ss_hor;
950                 }
951                 t->by -= y << ss_ver;
952             }
953         }
954     }
955 }
956 
/*
 * Motion-compensated prediction of one block of plane pl from the reference
 * picture refp.
 *
 * Exactly one of dst8 / dst16 must be non-NULL (asserted): dst8 selects the
 * mc / mc_scaled DSP functions writing pixels with stride dst_stride, dst16
 * selects the mct / mct_scaled variants writing int16_t output.
 *
 * bw4/bh4 and bx/by are multiplied by h_mul/v_mul (4 >> subsampling) to get
 * the block size and position in samples of the plane. When the reference has
 * the same dimensions as the current frame the unscaled path is used,
 * otherwise the position is scaled with f->svc[refidx]. In both paths, if the
 * samples needed by the filter extend outside the reference, they are first
 * gathered into t->scratch.emu_edge via emu_edge().
 *
 * Returns 0 on every path.
 */
static int mc(Dav1dTaskContext *const t,
              pixel *const dst8, int16_t *const dst16, const ptrdiff_t dst_stride,
              const int bw4, const int bh4,
              const int bx, const int by, const int pl,
              const mv mv, const Dav1dThreadPicture *const refp, const int refidx,
              const enum Filter2d filter_2d)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor;
    const int v_mul = 4 >> ss_ver;
    const int blk_w = bw4 * h_mul;
    const int blk_h = bh4 * v_mul;
    const int mvx = mv.x;
    const int mvy = mv.y;
    ptrdiff_t ref_stride = refp->p.stride[!!pl];
    const pixel *ref;

    if (refp->p.p.w == f->cur.p.w && refp->p.p.h == f->cur.p.h) {
        /* Reference and current frame have equal size: unscaled prediction. */
        const int mx = mvx & (15 >> !ss_hor);
        const int my = mvy & (15 >> !ss_ver);
        const int frac_x = !!mx;  // 1 if a horizontal filter is needed
        const int frac_y = !!my;  // 1 if a vertical filter is needed
        const int dx = bx * h_mul + (mvx >> (3 + ss_hor));
        const int dy = by * v_mul + (mvy >> (3 + ss_ver));
        int w, h;

        if (refp->p.data[0] == f->cur.data[0]) {
            // reference is the current picture, i.e. intrabc
            w = f->bw * 4 >> ss_hor;
            h = f->bh * 4 >> ss_ver;
        } else {
            w = (f->cur.p.w + ss_hor) >> ss_hor;
            h = (f->cur.p.h + ss_ver) >> ss_ver;
        }

        // filter support: 3 samples before and 4 after, only on axes with a
        // fractional motion vector component
        const int out_of_bounds =
            dx < frac_x * 3 || dy < frac_y * 3 ||
            dx + blk_w + frac_x * 4 > w ||
            dy + blk_h + frac_y * 4 > h;

        if (out_of_bounds) {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(blk_w + frac_x * 7, blk_h + frac_y * 7,
                                w, h, dx - frac_x * 3, dy - frac_y * 3,
                                emu_edge_buf, 192 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[192 * frac_y * 3 + frac_x * 3];
            ref_stride = 192 * sizeof(pixel);
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc[filter_2d](dst8, dst_stride, ref, ref_stride,
                                     blk_w, blk_h,
                                     mx << !ss_hor, my << !ss_ver
                                     HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct[filter_2d](dst16, ref, ref_stride,
                                      blk_w, blk_h,
                                      mx << !ss_hor, my << !ss_ver
                                      HIGHBD_CALL_SUFFIX);
        }
    } else {
        /* Reference has a different size: scaled prediction. */
        assert(refp != &f->sr_cur);

        const int orig_pos_y = (by * v_mul << 4) + mvy * (1 << !ss_ver);
        const int orig_pos_x = (bx * h_mul << 4) + mvx * (1 << !ss_hor);
#define scale_mv(res, val, scale) do { \
            const int64_t tmp = (int64_t)(val) * scale + (scale - 0x4000) * 8; \
            res = apply_sign64((int) ((llabs(tmp) + 128) >> 8), tmp) + 32;     \
        } while (0)
        int pos_y, pos_x;
        scale_mv(pos_x, orig_pos_x, f->svc[refidx][0].scale);
        scale_mv(pos_y, orig_pos_y, f->svc[refidx][1].scale);
#undef scale_mv

        // integer bounds (pos_* carry 10 fractional bits) of the area read
        const int left = pos_x >> 10;
        const int top = pos_y >> 10;
        const int right =
            ((pos_x + (blk_w - 1) * f->svc[refidx][0].step) >> 10) + 1;
        const int bottom =
            ((pos_y + (blk_h - 1) * f->svc[refidx][1].step) >> 10) + 1;

        if (DEBUG_BLOCK_INFO)
            printf("Off %dx%d [%d,%d,%d], size %dx%d [%d,%d]\n",
                   left, top, orig_pos_x, f->svc[refidx][0].scale, refidx,
                   right-left, bottom-top,
                   f->svc[refidx][0].step, f->svc[refidx][1].step);

        const int w = (refp->p.p.w + ss_hor) >> ss_hor;
        const int h = (refp->p.p.h + ss_ver) >> ss_ver;
        const int out_of_bounds =
            left < 3 || top < 3 || right + 4 > w || bottom + 4 > h;

        if (out_of_bounds) {
            pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
            f->dsp->mc.emu_edge(right - left + 7, bottom - top + 7,
                                w, h, left - 3, top - 3,
                                emu_edge_buf, 320 * sizeof(pixel),
                                refp->p.data[pl], ref_stride);
            ref = &emu_edge_buf[320 * 3 + 3];
            ref_stride = 320 * sizeof(pixel);
            if (DEBUG_BLOCK_INFO) printf("Emu\n");
        } else {
            ref = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * top + left;
        }

        if (dst8 != NULL) {
            f->dsp->mc.mc_scaled[filter_2d](dst8, dst_stride, ref, ref_stride,
                                            blk_w, blk_h,
                                            pos_x & 0x3ff, pos_y & 0x3ff,
                                            f->svc[refidx][0].step,
                                            f->svc[refidx][1].step
                                            HIGHBD_CALL_SUFFIX);
        } else {
            f->dsp->mc.mct_scaled[filter_2d](dst16, ref, ref_stride,
                                             blk_w, blk_h,
                                             pos_x & 0x3ff, pos_y & 0x3ff,
                                             f->svc[refidx][0].step,
                                             f->svc[refidx][1].step
                                             HIGHBD_CALL_SUFFIX);
        }
    }

    return 0;
}
1070 
/*
 * Overlapped block motion compensation for plane pl of the current block.
 *
 * For the row above (when not on the tile's first row, and for chroma only if
 * b_dim[0] * h_mul + b_dim[1] * v_mul >= 16) and for the column to the left
 * (when not on the tile's first column), the neighbouring blocks are walked;
 * each neighbour with ref.ref[0] > 0 is predicted with its own motion vector
 * into t->scratch.lap via mc() and blended into dst with blend_h() (above) or
 * blend_v() (left). At most imin(b_dim[2], 4) resp. imin(b_dim[3], 4)
 * neighbours are blended per direction.
 *
 * Returns the first non-zero result of mc(), otherwise 0.
 */
static int obmc(Dav1dTaskContext *const t,
                pixel *const dst, const ptrdiff_t dst_stride,
                const uint8_t *const b_dim, const int pl,
                const int bx4, const int by4, const int w4, const int h4)
{
    assert(!(t->bx & 1) && !(t->by & 1));
    const Dav1dFrameContext *const f = t->f;
    /*const*/ refmvs_block **r = &t->rt.r[(t->by & 31) + 5];
    pixel *const lap = bitfn(t->scratch.lap);
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor;
    const int v_mul = 4 >> ss_ver;

    /* neighbours above */
    const int use_top = t->by > t->ts->tiling.row_start &&
                        (!pl || b_dim[0] * h_mul + b_dim[1] * v_mul >= 16);
    if (use_top) {
        const int max_blend = imin(b_dim[2], 4);
        int n_blend = 0;
        int x = 0;
        while (x < w4 && n_blend < max_blend) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const above = &r[-1][t->bx + x + 1];
            const uint8_t *const above_dim = dav1d_block_dimensions[above->bs];
            const int step4 = iclip(above_dim[0], 2, 16);

            if (above->ref.ref[0] > 0) {
                const int ow4 = imin(step4, b_dim[0]);
                const int oh4 = imin(b_dim[1], 16) >> 1;
                const int ref_idx = above->ref.ref[0] - 1;
                const enum Filter2d filt =
                    dav1d_filter_2d[t->a->filter[1][bx4 + x + 1]][t->a->filter[0][bx4 + x + 1]];
                const int ret = mc(t, lap, NULL, ow4 * h_mul * sizeof(pixel),
                                   ow4, (oh4 * 3 + 3) >> 2,
                                   t->bx + x, t->by, pl, above->mv.mv[0],
                                   &f->refp[ref_idx], ref_idx, filt);
                if (ret)
                    return ret;
                f->dsp->mc.blend_h(&dst[x * h_mul], dst_stride, lap,
                                   h_mul * ow4, v_mul * oh4);
                n_blend++;
            }
            x += step4;
        }
    }

    /* neighbours to the left */
    if (t->bx > t->ts->tiling.col_start) {
        const int max_blend = imin(b_dim[3], 4);
        int n_blend = 0;
        int y = 0;
        while (y < h4 && n_blend < max_blend) {
            // only odd blocks are considered for overlap handling, hence +1
            const refmvs_block *const left = &r[y + 1][t->bx - 1];
            const uint8_t *const left_dim = dav1d_block_dimensions[left->bs];
            const int step4 = iclip(left_dim[1], 2, 16);

            if (left->ref.ref[0] > 0) {
                const int ow4 = imin(b_dim[0], 16) >> 1;
                const int oh4 = imin(step4, b_dim[1]);
                const int ref_idx = left->ref.ref[0] - 1;
                const enum Filter2d filt =
                    dav1d_filter_2d[t->l.filter[1][by4 + y + 1]][t->l.filter[0][by4 + y + 1]];
                const int ret = mc(t, lap, NULL, h_mul * ow4 * sizeof(pixel),
                                   ow4, oh4,
                                   t->bx, t->by + y, pl, left->mv.mv[0],
                                   &f->refp[ref_idx], ref_idx, filt);
                if (ret)
                    return ret;
                f->dsp->mc.blend_v(&dst[y * v_mul * PXSTRIDE(dst_stride)],
                                   dst_stride, lap, h_mul * ow4, v_mul * oh4);
                n_blend++;
            }
            y += step4;
        }
    }

    return 0;
}
1133 
/*
 * Warped (affine) motion prediction of plane pl for the current block.
 *
 * The block, whose dimensions in samples must be multiples of 8 (asserted), is
 * processed in 8x8 units. For each unit the matrix wmp->matrix is evaluated at
 * the unit's centre (in luma sample units) to obtain the integer source
 * position (dx, dy) and the filter offsets (mx, my). The unit is then
 * predicted with warp8x8t() into dst16 when dst16 is non-NULL, else with
 * warp8x8() into dst8. If the source samples reach outside the reference
 * plane, a 15x15 area is first gathered into t->scratch.emu_edge.
 *
 * Exactly one of dst8 / dst16 must be non-NULL (asserted). dstride is applied
 * through PXSTRIDE() for dst8 and directly, as an element count, for dst16.
 *
 * Returns 0.
 */
static int warp_affine(Dav1dTaskContext *const t,
                       pixel *dst8, int16_t *dst16, const ptrdiff_t dstride,
                       const uint8_t *const b_dim, const int pl,
                       const Dav1dThreadPicture *const refp,
                       const Dav1dWarpedMotionParams *const wmp)
{
    assert((dst8 != NULL) ^ (dst16 != NULL));
    const Dav1dFrameContext *const f = t->f;
    const Dav1dDSPContext *const dsp = f->dsp;
    const int ss_ver = !!pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
    const int ss_hor = !!pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
    const int h_mul = 4 >> ss_hor;
    const int v_mul = 4 >> ss_ver;
    const int blk_w = b_dim[0] * h_mul;
    const int blk_h = b_dim[1] * v_mul;
    assert(!(blk_w & 7) && !(blk_h & 7));
    const int32_t *const mat = wmp->matrix;
    const int width = (refp->p.p.w + ss_hor) >> ss_hor;
    const int height = (refp->p.p.h + ss_ver) >> ss_ver;

    for (int y = 0; y < blk_h; y += 8) {
        // row-dependent part of the affine transform, shared by the whole row
        const int src_y = t->by * 4 + ((y + 4) << ss_ver);
        const int64_t mat3_y = (int64_t) mat[3] * src_y + mat[0];
        const int64_t mat5_y = (int64_t) mat[5] * src_y + mat[1];

        for (int x = 0; x < blk_w; x += 8) {
            // calculate transformation relative to center of 8x8 block in
            // luma pixel units
            const int src_x = t->bx * 4 + ((x + 4) << ss_hor);
            const int64_t mvx = ((int64_t) mat[2] * src_x + mat3_y) >> ss_hor;
            const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;

            // integer position (upper 48 bits) and filter offset (low 16 bits)
            const int dx = (int) (mvx >> 16) - 4;
            const int dy = (int) (mvy >> 16) - 4;
            const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
                                                   wmp->u.p.beta  * 7) & ~0x3f;
            const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
                                                   wmp->u.p.delta * 4) & ~0x3f;

            const pixel *ref_ptr;
            ptrdiff_t ref_stride = refp->p.stride[!!pl];

            const int fully_inside = dx >= 3 && dx + 8 + 4 <= width &&
                                     dy >= 3 && dy + 8 + 4 <= height;
            if (fully_inside) {
                ref_ptr = ((pixel *) refp->p.data[pl]) + PXSTRIDE(ref_stride) * dy + dx;
            } else {
                pixel *const emu_edge_buf = bitfn(t->scratch.emu_edge);
                dsp->mc.emu_edge(15, 15, width, height, dx - 3, dy - 3,
                                 emu_edge_buf, 32 * sizeof(pixel),
                                 refp->p.data[pl], ref_stride);
                ref_ptr = &emu_edge_buf[32 * 3 + 3];
                ref_stride = 32 * sizeof(pixel);
            }

            if (dst16 != NULL) {
                dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
                                 wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            } else {
                dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
                                wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
            }
        }

        // advance the output by one row of 8x8 units
        if (dst8) {
            dst8 += 8 * PXSTRIDE(dstride);
        } else {
            dst16 += 8 * dstride;
        }
    }

    return 0;
}
1194 
bytefn(dav1d_recon_b_intra)1195 void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize bs,
1196                                  const enum EdgeFlags intra_edge_flags,
1197                                  const Av1Block *const b)
1198 {
1199     Dav1dTileState *const ts = t->ts;
1200     const Dav1dFrameContext *const f = t->f;
1201     const Dav1dDSPContext *const dsp = f->dsp;
1202     const int bx4 = t->bx & 31, by4 = t->by & 31;
1203     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1204     const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1205     const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1206     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1207     const int bw4 = b_dim[0], bh4 = b_dim[1];
1208     const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1209     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1210     const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1211                            (bw4 > ss_hor || t->bx & 1) &&
1212                            (bh4 > ss_ver || t->by & 1);
1213     const TxfmInfo *const t_dim = &dav1d_txfm_dimensions[b->tx];
1214     const TxfmInfo *const uv_t_dim = &dav1d_txfm_dimensions[b->uvtx];
1215 
1216     // coefficient coding
1217     pixel *const edge = bitfn(t->scratch.edge) + 128;
1218     const int cbw4 = (bw4 + ss_hor) >> ss_hor, cbh4 = (bh4 + ss_ver) >> ss_ver;
1219 
1220     const int intra_edge_filter_flag = f->seq_hdr->intra_edge_filter << 10;
1221 
1222     for (int init_y = 0; init_y < h4; init_y += 16) {
1223         const int sub_h4 = imin(h4, 16 + init_y);
1224         const int sub_ch4 = imin(ch4, (init_y + 16) >> ss_ver);
1225         for (int init_x = 0; init_x < w4; init_x += 16) {
1226             if (b->pal_sz[0]) {
1227                 pixel *dst = ((pixel *) f->cur.data[0]) +
1228                              4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1229                 const uint8_t *pal_idx;
1230                 if (t->frame_thread.pass) {
1231                     const int p = t->frame_thread.pass & 1;
1232                     assert(ts->frame_thread[p].pal_idx);
1233                     pal_idx = ts->frame_thread[p].pal_idx;
1234                     ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
1235                 } else {
1236                     pal_idx = t->scratch.pal_idx_y;
1237                 }
1238                 const pixel *const pal = t->frame_thread.pass ?
1239                     f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1240                                         ((t->bx >> 1) + (t->by & 1))][0] :
1241                     bytefn(t->scratch.pal)[0];
1242                 f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
1243                                        pal_idx, bw4 * 4, bh4 * 4);
1244                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1245                     hex_dump(dst, PXSTRIDE(f->cur.stride[0]),
1246                              bw4 * 4, bh4 * 4, "y-pal-pred");
1247             }
1248 
1249             const int intra_flags = (sm_flag(t->a, bx4) |
1250                                      sm_flag(&t->l, by4) |
1251                                      intra_edge_filter_flag);
1252             const int sb_has_tr = init_x + 16 < w4 ? 1 : init_y ? 0 :
1253                               intra_edge_flags & EDGE_I444_TOP_HAS_RIGHT;
1254             const int sb_has_bl = init_x ? 0 : init_y + 16 < h4 ? 1 :
1255                               intra_edge_flags & EDGE_I444_LEFT_HAS_BOTTOM;
1256             int y, x;
1257             const int sub_w4 = imin(w4, init_x + 16);
1258             for (y = init_y, t->by += init_y; y < sub_h4;
1259                  y += t_dim->h, t->by += t_dim->h)
1260             {
1261                 pixel *dst = ((pixel *) f->cur.data[0]) +
1262                                4 * (t->by * PXSTRIDE(f->cur.stride[0]) +
1263                                     t->bx + init_x);
1264                 for (x = init_x, t->bx += init_x; x < sub_w4;
1265                      x += t_dim->w, t->bx += t_dim->w)
1266                 {
1267                     if (b->pal_sz[0]) goto skip_y_pred;
1268 
1269                     int angle = b->y_angle;
1270                     const enum EdgeFlags edge_flags =
1271                         (((y > init_y || !sb_has_tr) && (x + t_dim->w >= sub_w4)) ?
1272                              0 : EDGE_I444_TOP_HAS_RIGHT) |
1273                         ((x > init_x || (!sb_has_bl && y + t_dim->h >= sub_h4)) ?
1274                              0 : EDGE_I444_LEFT_HAS_BOTTOM);
1275                     const pixel *top_sb_edge = NULL;
1276                     if (!(t->by & (f->sb_step - 1))) {
1277                         top_sb_edge = f->ipred_edge[0];
1278                         const int sby = t->by >> f->sb_shift;
1279                         top_sb_edge += f->sb128w * 128 * (sby - 1);
1280                     }
1281                     const enum IntraPredMode m =
1282                         bytefn(dav1d_prepare_intra_edges)(t->bx,
1283                                                           t->bx > ts->tiling.col_start,
1284                                                           t->by,
1285                                                           t->by > ts->tiling.row_start,
1286                                                           ts->tiling.col_end,
1287                                                           ts->tiling.row_end,
1288                                                           edge_flags, dst,
1289                                                           f->cur.stride[0], top_sb_edge,
1290                                                           b->y_mode, &angle,
1291                                                           t_dim->w, t_dim->h,
1292                                                           f->seq_hdr->intra_edge_filter,
1293                                                           edge HIGHBD_CALL_SUFFIX);
1294                     dsp->ipred.intra_pred[m](dst, f->cur.stride[0], edge,
1295                                              t_dim->w * 4, t_dim->h * 4,
1296                                              angle | intra_flags,
1297                                              4 * f->bw - 4 * t->bx,
1298                                              4 * f->bh - 4 * t->by
1299                                              HIGHBD_CALL_SUFFIX);
1300 
1301                     if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1302                         hex_dump(edge - t_dim->h * 4, t_dim->h * 4,
1303                                  t_dim->h * 4, 2, "l");
1304                         hex_dump(edge, 0, 1, 1, "tl");
1305                         hex_dump(edge + 1, t_dim->w * 4,
1306                                  t_dim->w * 4, 2, "t");
1307                         hex_dump(dst, f->cur.stride[0],
1308                                  t_dim->w * 4, t_dim->h * 4, "y-intra-pred");
1309                     }
1310 
1311                 skip_y_pred: {}
1312                     if (!b->skip) {
1313                         coef *cf;
1314                         int eob;
1315                         enum TxfmType txtp;
1316                         if (t->frame_thread.pass) {
1317                             const int p = t->frame_thread.pass & 1;
1318                             const int cbi = *ts->frame_thread[p].cbi++;
1319                             cf = ts->frame_thread[p].cf;
1320                             ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
1321                             eob  = cbi >> 5;
1322                             txtp = cbi & 0x1f;
1323                         } else {
1324                             uint8_t cf_ctx;
1325                             cf = bitfn(t->cf);
1326                             eob = decode_coefs(t, &t->a->lcoef[bx4 + x],
1327                                                &t->l.lcoef[by4 + y], b->tx, bs,
1328                                                b, 1, 0, cf, &txtp, &cf_ctx);
1329                             if (DEBUG_BLOCK_INFO)
1330                                 printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
1331                                        b->tx, txtp, eob, ts->msac.rng);
1332 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
1333                             rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
1334 #define default_memset(dir, diridx, off, sz) \
1335                             memset(&t->dir lcoef[off], cf_ctx, sz)
1336                             case_set_upto16_with_default(imin(t_dim->h, f->bh - t->by), \
1337                                                          l., 1, by4 + y);
1338                             case_set_upto16_with_default(imin(t_dim->w, f->bw - t->bx), \
1339                                                          a->, 0, bx4 + x);
1340 #undef default_memset
1341 #undef set_ctx
1342                         }
1343                         if (eob >= 0) {
1344                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1345                                 coef_dump(cf, imin(t_dim->h, 8) * 4,
1346                                           imin(t_dim->w, 8) * 4, 3, "dq");
1347                             dsp->itx.itxfm_add[b->tx]
1348                                               [txtp](dst,
1349                                                      f->cur.stride[0],
1350                                                      cf, eob HIGHBD_CALL_SUFFIX);
1351                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1352                                 hex_dump(dst, f->cur.stride[0],
1353                                          t_dim->w * 4, t_dim->h * 4, "recon");
1354                         }
1355                     } else if (!t->frame_thread.pass) {
1356 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
1357                         rep_macro(type, t->dir lcoef, off, mul * 0x40)
1358                         case_set_upto16(t_dim->h, l., 1, by4 + y);
1359                         case_set_upto16(t_dim->w, a->, 0, bx4 + x);
1360 #undef set_ctx
1361                     }
1362                     dst += 4 * t_dim->w;
1363                 }
1364                 t->bx -= x;
1365             }
1366             t->by -= y;
1367 
1368             if (!has_chroma) continue;
1369 
1370             const ptrdiff_t stride = f->cur.stride[1];
1371 
1372             if (b->uv_mode == CFL_PRED) {
1373                 assert(!init_x && !init_y);
1374 
1375                 int16_t *const ac = t->scratch.ac;
1376                 pixel *y_src = ((pixel *) f->cur.data[0]) + 4 * (t->bx & ~ss_hor) +
1377                                  4 * (t->by & ~ss_ver) * PXSTRIDE(f->cur.stride[0]);
1378                 const ptrdiff_t uv_off = 4 * ((t->bx >> ss_hor) +
1379                                               (t->by >> ss_ver) * PXSTRIDE(stride));
1380                 pixel *const uv_dst[2] = { ((pixel *) f->cur.data[1]) + uv_off,
1381                                            ((pixel *) f->cur.data[2]) + uv_off };
1382 
1383                 const int furthest_r =
1384                     ((cw4 << ss_hor) + t_dim->w - 1) & ~(t_dim->w - 1);
1385                 const int furthest_b =
1386                     ((ch4 << ss_ver) + t_dim->h - 1) & ~(t_dim->h - 1);
1387                 dsp->ipred.cfl_ac[f->cur.p.layout - 1](ac, y_src, f->cur.stride[0],
1388                                                          cbw4 - (furthest_r >> ss_hor),
1389                                                          cbh4 - (furthest_b >> ss_ver),
1390                                                          cbw4 * 4, cbh4 * 4);
1391                 for (int pl = 0; pl < 2; pl++) {
1392                     if (!b->cfl_alpha[pl]) continue;
1393                     int angle = 0;
1394                     const pixel *top_sb_edge = NULL;
1395                     if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1396                         top_sb_edge = f->ipred_edge[pl + 1];
1397                         const int sby = t->by >> f->sb_shift;
1398                         top_sb_edge += f->sb128w * 128 * (sby - 1);
1399                     }
1400                     const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1401                     const int xstart = ts->tiling.col_start >> ss_hor;
1402                     const int ystart = ts->tiling.row_start >> ss_ver;
1403                     const enum IntraPredMode m =
1404                         bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1405                                                           ypos, ypos > ystart,
1406                                                           ts->tiling.col_end >> ss_hor,
1407                                                           ts->tiling.row_end >> ss_ver,
1408                                                           0, uv_dst[pl], stride,
1409                                                           top_sb_edge, DC_PRED, &angle,
1410                                                           uv_t_dim->w, uv_t_dim->h, 0,
1411                                                           edge HIGHBD_CALL_SUFFIX);
1412                     dsp->ipred.cfl_pred[m](uv_dst[pl], stride, edge,
1413                                            uv_t_dim->w * 4,
1414                                            uv_t_dim->h * 4,
1415                                            ac, b->cfl_alpha[pl]
1416                                            HIGHBD_CALL_SUFFIX);
1417                 }
1418                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1419                     ac_dump(ac, 4*cbw4, 4*cbh4, "ac");
1420                     hex_dump(uv_dst[0], stride, cbw4 * 4, cbh4 * 4, "u-cfl-pred");
1421                     hex_dump(uv_dst[1], stride, cbw4 * 4, cbh4 * 4, "v-cfl-pred");
1422                 }
1423             } else if (b->pal_sz[1]) {
1424                 const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
1425                                               (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1426                 const pixel (*pal)[8];
1427                 const uint8_t *pal_idx;
1428                 if (t->frame_thread.pass) {
1429                     const int p = t->frame_thread.pass & 1;
1430                     assert(ts->frame_thread[p].pal_idx);
1431                     pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
1432                                               ((t->bx >> 1) + (t->by & 1))];
1433                     pal_idx = ts->frame_thread[p].pal_idx;
1434                     ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
1435                 } else {
1436                     pal = bytefn(t->scratch.pal);
1437                     pal_idx = t->scratch.pal_idx_uv;
1438                 }
1439 
1440                 f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
1441                                        f->cur.stride[1], pal[1],
1442                                        pal_idx, cbw4 * 4, cbh4 * 4);
1443                 f->dsp->ipred.pal_pred(((pixel *) f->cur.data[2]) + uv_dstoff,
1444                                        f->cur.stride[1], pal[2],
1445                                        pal_idx, cbw4 * 4, cbh4 * 4);
1446                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1447                     hex_dump(((pixel *) f->cur.data[1]) + uv_dstoff,
1448                              PXSTRIDE(f->cur.stride[1]),
1449                              cbw4 * 4, cbh4 * 4, "u-pal-pred");
1450                     hex_dump(((pixel *) f->cur.data[2]) + uv_dstoff,
1451                              PXSTRIDE(f->cur.stride[1]),
1452                              cbw4 * 4, cbh4 * 4, "v-pal-pred");
1453                 }
1454             }
1455 
1456             const int sm_uv_fl = sm_uv_flag(t->a, cbx4) |
1457                                  sm_uv_flag(&t->l, cby4);
1458             const int uv_sb_has_tr =
1459                 ((init_x + 16) >> ss_hor) < cw4 ? 1 : init_y ? 0 :
1460                 intra_edge_flags & (EDGE_I420_TOP_HAS_RIGHT >> (f->cur.p.layout - 1));
1461             const int uv_sb_has_bl =
1462                 init_x ? 0 : ((init_y + 16) >> ss_ver) < ch4 ? 1 :
1463                 intra_edge_flags & (EDGE_I420_LEFT_HAS_BOTTOM >> (f->cur.p.layout - 1));
1464             const int sub_cw4 = imin(cw4, (init_x + 16) >> ss_hor);
1465             for (int pl = 0; pl < 2; pl++) {
1466                 for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
1467                      y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
1468                 {
1469                     pixel *dst = ((pixel *) f->cur.data[1 + pl]) +
1470                                    4 * ((t->by >> ss_ver) * PXSTRIDE(stride) +
1471                                         ((t->bx + init_x) >> ss_hor));
1472                     for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
1473                          x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
1474                     {
1475                         if ((b->uv_mode == CFL_PRED && b->cfl_alpha[pl]) ||
1476                             b->pal_sz[1])
1477                         {
1478                             goto skip_uv_pred;
1479                         }
1480 
1481                         int angle = b->uv_angle;
1482                         // this probably looks weird because we're using
1483                         // luma flags in a chroma loop, but that's because
1484                         // prepare_intra_edges() expects luma flags as input
1485                         const enum EdgeFlags edge_flags =
1486                             (((y > (init_y >> ss_ver) || !uv_sb_has_tr) &&
1487                               (x + uv_t_dim->w >= sub_cw4)) ?
1488                                  0 : EDGE_I444_TOP_HAS_RIGHT) |
1489                             ((x > (init_x >> ss_hor) ||
1490                               (!uv_sb_has_bl && y + uv_t_dim->h >= sub_ch4)) ?
1491                                  0 : EDGE_I444_LEFT_HAS_BOTTOM);
1492                         const pixel *top_sb_edge = NULL;
1493                         if (!((t->by & ~ss_ver) & (f->sb_step - 1))) {
1494                             top_sb_edge = f->ipred_edge[1 + pl];
1495                             const int sby = t->by >> f->sb_shift;
1496                             top_sb_edge += f->sb128w * 128 * (sby - 1);
1497                         }
1498                         const enum IntraPredMode uv_mode =
1499                              b->uv_mode == CFL_PRED ? DC_PRED : b->uv_mode;
1500                         const int xpos = t->bx >> ss_hor, ypos = t->by >> ss_ver;
1501                         const int xstart = ts->tiling.col_start >> ss_hor;
1502                         const int ystart = ts->tiling.row_start >> ss_ver;
1503                         const enum IntraPredMode m =
1504                             bytefn(dav1d_prepare_intra_edges)(xpos, xpos > xstart,
1505                                                               ypos, ypos > ystart,
1506                                                               ts->tiling.col_end >> ss_hor,
1507                                                               ts->tiling.row_end >> ss_ver,
1508                                                               edge_flags, dst, stride,
1509                                                               top_sb_edge, uv_mode,
1510                                                               &angle, uv_t_dim->w,
1511                                                               uv_t_dim->h,
1512                                                               f->seq_hdr->intra_edge_filter,
1513                                                               edge HIGHBD_CALL_SUFFIX);
1514                         angle |= intra_edge_filter_flag;
1515                         dsp->ipred.intra_pred[m](dst, stride, edge,
1516                                                  uv_t_dim->w * 4,
1517                                                  uv_t_dim->h * 4,
1518                                                  angle | sm_uv_fl,
1519                                                  (4 * f->bw + ss_hor -
1520                                                   4 * (t->bx & ~ss_hor)) >> ss_hor,
1521                                                  (4 * f->bh + ss_ver -
1522                                                   4 * (t->by & ~ss_ver)) >> ss_ver
1523                                                  HIGHBD_CALL_SUFFIX);
1524                         if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1525                             hex_dump(edge - uv_t_dim->h * 4, uv_t_dim->h * 4,
1526                                      uv_t_dim->h * 4, 2, "l");
1527                             hex_dump(edge, 0, 1, 1, "tl");
1528                             hex_dump(edge + 1, uv_t_dim->w * 4,
1529                                      uv_t_dim->w * 4, 2, "t");
1530                             hex_dump(dst, stride, uv_t_dim->w * 4,
1531                                      uv_t_dim->h * 4, pl ? "v-intra-pred" : "u-intra-pred");
1532                         }
1533 
1534                     skip_uv_pred: {}
1535                         if (!b->skip) {
1536                             enum TxfmType txtp;
1537                             int eob;
1538                             coef *cf;
1539                             if (t->frame_thread.pass) {
1540                                 const int p = t->frame_thread.pass & 1;
1541                                 const int cbi = *ts->frame_thread[p].cbi++;
1542                                 cf = ts->frame_thread[p].cf;
1543                                 ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
1544                                 eob  = cbi >> 5;
1545                                 txtp = cbi & 0x1f;
1546                             } else {
1547                                 uint8_t cf_ctx;
1548                                 cf = bitfn(t->cf);
1549                                 eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1550                                                    &t->l.ccoef[pl][cby4 + y],
1551                                                    b->uvtx, bs, b, 1, 1 + pl, cf,
1552                                                    &txtp, &cf_ctx);
1553                                 if (DEBUG_BLOCK_INFO)
1554                                     printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1555                                            "txtp=%d,eob=%d]: r=%d [x=%d,cbx4=%d]\n",
1556                                            pl, b->uvtx, txtp, eob, ts->msac.rng, x, cbx4);
1557 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
1558                                 rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
1559 #define default_memset(dir, diridx, off, sz) \
1560                                 memset(&t->dir ccoef[pl][off], cf_ctx, sz)
1561                                 case_set_upto16_with_default( \
1562                                          imin(uv_t_dim->h, (f->bh - t->by + ss_ver) >> ss_ver),
1563                                          l., 1, cby4 + y);
1564                                 case_set_upto16_with_default( \
1565                                          imin(uv_t_dim->w, (f->bw - t->bx + ss_hor) >> ss_hor),
1566                                          a->, 0, cbx4 + x);
1567 #undef default_memset
1568 #undef set_ctx
1569                             }
1570                             if (eob >= 0) {
1571                                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1572                                     coef_dump(cf, uv_t_dim->h * 4,
1573                                               uv_t_dim->w * 4, 3, "dq");
1574                                 dsp->itx.itxfm_add[b->uvtx]
1575                                                   [txtp](dst, stride,
1576                                                          cf, eob HIGHBD_CALL_SUFFIX);
1577                                 if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
1578                                     hex_dump(dst, stride, uv_t_dim->w * 4,
1579                                              uv_t_dim->h * 4, "recon");
1580                             }
1581                         } else if (!t->frame_thread.pass) {
1582 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
1583                             rep_macro(type, t->dir ccoef[pl], off, mul * 0x40)
1584                             case_set_upto16(uv_t_dim->h, l., 1, cby4 + y);
1585                             case_set_upto16(uv_t_dim->w, a->, 0, cbx4 + x);
1586 #undef set_ctx
1587                         }
1588                         dst += uv_t_dim->w * 4;
1589                     }
1590                     t->bx -= x << ss_hor;
1591                 }
1592                 t->by -= y << ss_ver;
1593             }
1594         }
1595     }
1596 }
1597 
bytefn(dav1d_recon_b_inter)1598 int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize bs,
1599                                 const Av1Block *const b)
1600 {
1601     Dav1dTileState *const ts = t->ts;
1602     const Dav1dFrameContext *const f = t->f;
1603     const Dav1dDSPContext *const dsp = f->dsp;
1604     const int bx4 = t->bx & 31, by4 = t->by & 31;
1605     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
1606     const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
1607     const int cbx4 = bx4 >> ss_hor, cby4 = by4 >> ss_ver;
1608     const uint8_t *const b_dim = dav1d_block_dimensions[bs];
1609     const int bw4 = b_dim[0], bh4 = b_dim[1];
1610     const int w4 = imin(bw4, f->bw - t->bx), h4 = imin(bh4, f->bh - t->by);
1611     const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400 &&
1612                            (bw4 > ss_hor || t->bx & 1) &&
1613                            (bh4 > ss_ver || t->by & 1);
1614     const int chr_layout_idx = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I400 ? 0 :
1615                                DAV1D_PIXEL_LAYOUT_I444 - f->cur.p.layout;
1616     int res;
1617 
1618     // prediction
1619     const int cbh4 = (bh4 + ss_ver) >> ss_ver, cbw4 = (bw4 + ss_hor) >> ss_hor;
1620     pixel *dst = ((pixel *) f->cur.data[0]) +
1621         4 * (t->by * PXSTRIDE(f->cur.stride[0]) + t->bx);
1622     const ptrdiff_t uvdstoff =
1623         4 * ((t->bx >> ss_hor) + (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
1624     if (IS_KEY_OR_INTRA(f->frame_hdr)) {
1625         // intrabc
1626         assert(!f->frame_hdr->super_res.enabled);
1627         res = mc(t, dst, NULL, f->cur.stride[0], bw4, bh4, t->bx, t->by, 0,
1628                  b->mv[0], &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1629         if (res) return res;
1630         if (has_chroma) for (int pl = 1; pl < 3; pl++) {
1631             res = mc(t, ((pixel *)f->cur.data[pl]) + uvdstoff, NULL, f->cur.stride[1],
1632                      bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1633                      t->bx & ~ss_hor, t->by & ~ss_ver, pl, b->mv[0],
1634                      &f->sr_cur, 0 /* unused */, FILTER_2D_BILINEAR);
1635             if (res) return res;
1636         }
1637     } else if (b->comp_type == COMP_INTER_NONE) {
1638         const Dav1dThreadPicture *const refp = &f->refp[b->ref[0]];
1639         const enum Filter2d filter_2d = b->filter2d;
1640 
1641         if (imin(bw4, bh4) > 1 &&
1642             ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1643              (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1644         {
1645             res = warp_affine(t, dst, NULL, f->cur.stride[0], b_dim, 0, refp,
1646                               b->motion_mode == MM_WARP ? &t->warpmv :
1647                                   &f->frame_hdr->gmv[b->ref[0]]);
1648             if (res) return res;
1649         } else {
1650             res = mc(t, dst, NULL, f->cur.stride[0],
1651                      bw4, bh4, t->bx, t->by, 0, b->mv[0], refp, b->ref[0], filter_2d);
1652             if (res) return res;
1653             if (b->motion_mode == MM_OBMC) {
1654                 res = obmc(t, dst, f->cur.stride[0], b_dim, 0, bx4, by4, w4, h4);
1655                 if (res) return res;
1656             }
1657         }
1658         if (b->interintra_type) {
1659             pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1660             enum IntraPredMode m = b->interintra_mode == II_SMOOTH_PRED ?
1661                                    SMOOTH_PRED : b->interintra_mode;
1662             pixel *const tmp = bitfn(t->scratch.interintra);
1663             int angle = 0;
1664             const pixel *top_sb_edge = NULL;
1665             if (!(t->by & (f->sb_step - 1))) {
1666                 top_sb_edge = f->ipred_edge[0];
1667                 const int sby = t->by >> f->sb_shift;
1668                 top_sb_edge += f->sb128w * 128 * (sby - 1);
1669             }
1670             m = bytefn(dav1d_prepare_intra_edges)(t->bx, t->bx > ts->tiling.col_start,
1671                                                   t->by, t->by > ts->tiling.row_start,
1672                                                   ts->tiling.col_end, ts->tiling.row_end,
1673                                                   0, dst, f->cur.stride[0], top_sb_edge,
1674                                                   m, &angle, bw4, bh4, 0, tl_edge
1675                                                   HIGHBD_CALL_SUFFIX);
1676             dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
1677                                      tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
1678                                      HIGHBD_CALL_SUFFIX);
1679             dsp->mc.blend(dst, f->cur.stride[0], tmp,
1680                           bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
1681         }
1682 
1683         if (!has_chroma) goto skip_inter_chroma_pred;
1684 
1685         // sub8x8 derivation
1686         int is_sub8x8 = bw4 == ss_hor || bh4 == ss_ver;
1687         refmvs_block *const *r;
1688         if (is_sub8x8) {
1689             assert(ss_hor == 1);
1690             r = &t->rt.r[(t->by & 31) + 5];
1691             if (bw4 == 1) is_sub8x8 &= r[0][t->bx - 1].ref.ref[0] > 0;
1692             if (bh4 == ss_ver) is_sub8x8 &= r[-1][t->bx].ref.ref[0] > 0;
1693             if (bw4 == 1 && bh4 == ss_ver)
1694                 is_sub8x8 &= r[-1][t->bx - 1].ref.ref[0] > 0;
1695         }
1696 
1697         // chroma prediction
1698         if (is_sub8x8) {
1699             assert(ss_hor == 1);
1700             ptrdiff_t h_off = 0, v_off = 0;
1701             if (bw4 == 1 && bh4 == ss_ver) {
1702                 for (int pl = 0; pl < 2; pl++) {
1703                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1704                              NULL, f->cur.stride[1],
1705                              bw4, bh4, t->bx - 1, t->by - 1, 1 + pl,
1706                              r[-1][t->bx - 1].mv.mv[0],
1707                              &f->refp[r[-1][t->bx - 1].ref.ref[0] - 1],
1708                              r[-1][t->bx - 1].ref.ref[0] - 1,
1709                              t->frame_thread.pass != 2 ? t->tl_4x4_filter :
1710                                  f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx - 1].filter2d);
1711                     if (res) return res;
1712                 }
1713                 v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1714                 h_off = 2;
1715             }
1716             if (bw4 == 1) {
1717                 const enum Filter2d left_filter_2d =
1718                     dav1d_filter_2d[t->l.filter[1][by4]][t->l.filter[0][by4]];
1719                 for (int pl = 0; pl < 2; pl++) {
1720                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + v_off, NULL,
1721                              f->cur.stride[1], bw4, bh4, t->bx - 1,
1722                              t->by, 1 + pl, r[0][t->bx - 1].mv.mv[0],
1723                              &f->refp[r[0][t->bx - 1].ref.ref[0] - 1],
1724                              r[0][t->bx - 1].ref.ref[0] - 1,
1725                              t->frame_thread.pass != 2 ? left_filter_2d :
1726                                  f->frame_thread.b[(t->by * f->b4_stride) + t->bx - 1].filter2d);
1727                     if (res) return res;
1728                 }
1729                 h_off = 2;
1730             }
1731             if (bh4 == ss_ver) {
1732                 const enum Filter2d top_filter_2d =
1733                     dav1d_filter_2d[t->a->filter[1][bx4]][t->a->filter[0][bx4]];
1734                 for (int pl = 0; pl < 2; pl++) {
1735                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off, NULL,
1736                              f->cur.stride[1], bw4, bh4, t->bx, t->by - 1,
1737                              1 + pl, r[-1][t->bx].mv.mv[0],
1738                              &f->refp[r[-1][t->bx].ref.ref[0] - 1],
1739                              r[-1][t->bx].ref.ref[0] - 1,
1740                              t->frame_thread.pass != 2 ? top_filter_2d :
1741                                  f->frame_thread.b[((t->by - 1) * f->b4_stride) + t->bx].filter2d);
1742                     if (res) return res;
1743                 }
1744                 v_off = 2 * PXSTRIDE(f->cur.stride[1]);
1745             }
1746             for (int pl = 0; pl < 2; pl++) {
1747                 res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff + h_off + v_off, NULL, f->cur.stride[1],
1748                          bw4, bh4, t->bx, t->by, 1 + pl, b->mv[0],
1749                          refp, b->ref[0], filter_2d);
1750                 if (res) return res;
1751             }
1752         } else {
1753             if (imin(cbw4, cbh4) > 1 &&
1754                 ((b->inter_mode == GLOBALMV && f->gmv_warp_allowed[b->ref[0]]) ||
1755                  (b->motion_mode == MM_WARP && t->warpmv.type > DAV1D_WM_TYPE_TRANSLATION)))
1756             {
1757                 for (int pl = 0; pl < 2; pl++) {
1758                     res = warp_affine(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff, NULL,
1759                                       f->cur.stride[1], b_dim, 1 + pl, refp,
1760                                       b->motion_mode == MM_WARP ? &t->warpmv :
1761                                           &f->frame_hdr->gmv[b->ref[0]]);
1762                     if (res) return res;
1763                 }
1764             } else {
1765                 for (int pl = 0; pl < 2; pl++) {
1766                     res = mc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1767                              NULL, f->cur.stride[1],
1768                              bw4 << (bw4 == ss_hor), bh4 << (bh4 == ss_ver),
1769                              t->bx & ~ss_hor, t->by & ~ss_ver,
1770                              1 + pl, b->mv[0], refp, b->ref[0], filter_2d);
1771                     if (res) return res;
1772                     if (b->motion_mode == MM_OBMC) {
1773                         res = obmc(t, ((pixel *) f->cur.data[1 + pl]) + uvdstoff,
1774                                    f->cur.stride[1], b_dim, 1 + pl, bx4, by4, w4, h4);
1775                         if (res) return res;
1776                     }
1777                 }
1778             }
1779             if (b->interintra_type) {
1780                 // FIXME for 8x32 with 4:2:2 subsampling, this probably does
1781                 // the wrong thing since it will select 4x16, not 4x32, as a
1782                 // transform size...
1783                 const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
1784 
1785                 for (int pl = 0; pl < 2; pl++) {
1786                     pixel *const tmp = bitfn(t->scratch.interintra);
1787                     pixel *const tl_edge = bitfn(t->scratch.edge) + 32;
1788                     enum IntraPredMode m =
1789                         b->interintra_mode == II_SMOOTH_PRED ?
1790                         SMOOTH_PRED : b->interintra_mode;
1791                     int angle = 0;
1792                     pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1793                     const pixel *top_sb_edge = NULL;
1794                     if (!(t->by & (f->sb_step - 1))) {
1795                         top_sb_edge = f->ipred_edge[pl + 1];
1796                         const int sby = t->by >> f->sb_shift;
1797                         top_sb_edge += f->sb128w * 128 * (sby - 1);
1798                     }
1799                     m = bytefn(dav1d_prepare_intra_edges)(t->bx >> ss_hor,
1800                                                           (t->bx >> ss_hor) >
1801                                                               (ts->tiling.col_start >> ss_hor),
1802                                                           t->by >> ss_ver,
1803                                                           (t->by >> ss_ver) >
1804                                                               (ts->tiling.row_start >> ss_ver),
1805                                                           ts->tiling.col_end >> ss_hor,
1806                                                           ts->tiling.row_end >> ss_ver,
1807                                                           0, uvdst, f->cur.stride[1],
1808                                                           top_sb_edge, m,
1809                                                           &angle, cbw4, cbh4, 0, tl_edge
1810                                                           HIGHBD_CALL_SUFFIX);
1811                     dsp->ipred.intra_pred[m](tmp, cbw4 * 4 * sizeof(pixel),
1812                                              tl_edge, cbw4 * 4, cbh4 * 4, 0, 0, 0
1813                                              HIGHBD_CALL_SUFFIX);
1814                     dsp->mc.blend(uvdst, f->cur.stride[1], tmp,
1815                                   cbw4 * 4, cbh4 * 4, ii_mask);
1816                 }
1817             }
1818         }
1819 
1820     skip_inter_chroma_pred: {}
1821         t->tl_4x4_filter = filter_2d;
1822     } else {
1823         const enum Filter2d filter_2d = b->filter2d;
1824         // Maximum super block size is 128x128
1825         int16_t (*tmp)[128 * 128] = t->scratch.compinter;
1826         int jnt_weight;
1827         uint8_t *const seg_mask = t->scratch.seg_mask;
1828         const uint8_t *mask;
1829 
1830         for (int i = 0; i < 2; i++) {
1831             const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1832 
1833             if (b->inter_mode == GLOBALMV_GLOBALMV && f->gmv_warp_allowed[b->ref[i]]) {
1834                 res = warp_affine(t, NULL, tmp[i], bw4 * 4, b_dim, 0, refp,
1835                                   &f->frame_hdr->gmv[b->ref[i]]);
1836                 if (res) return res;
1837             } else {
1838                 res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by, 0,
1839                          b->mv[i], refp, b->ref[i], filter_2d);
1840                 if (res) return res;
1841             }
1842         }
1843         switch (b->comp_type) {
1844         case COMP_INTER_AVG:
1845             dsp->mc.avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1846                         bw4 * 4, bh4 * 4 HIGHBD_CALL_SUFFIX);
1847             break;
1848         case COMP_INTER_WEIGHTED_AVG:
1849             jnt_weight = f->jnt_weights[b->ref[0]][b->ref[1]];
1850             dsp->mc.w_avg(dst, f->cur.stride[0], tmp[0], tmp[1],
1851                           bw4 * 4, bh4 * 4, jnt_weight HIGHBD_CALL_SUFFIX);
1852             break;
1853         case COMP_INTER_SEG:
1854             dsp->mc.w_mask[chr_layout_idx](dst, f->cur.stride[0],
1855                                            tmp[b->mask_sign], tmp[!b->mask_sign],
1856                                            bw4 * 4, bh4 * 4, seg_mask,
1857                                            b->mask_sign HIGHBD_CALL_SUFFIX);
1858             mask = seg_mask;
1859             break;
1860         case COMP_INTER_WEDGE:
1861             mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
1862             dsp->mc.mask(dst, f->cur.stride[0],
1863                          tmp[b->mask_sign], tmp[!b->mask_sign],
1864                          bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
1865             if (has_chroma)
1866                 mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
1867             break;
1868         }
1869 
1870         // chroma
1871         if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1872             for (int i = 0; i < 2; i++) {
1873                 const Dav1dThreadPicture *const refp = &f->refp[b->ref[i]];
1874                 if (b->inter_mode == GLOBALMV_GLOBALMV &&
1875                     imin(cbw4, cbh4) > 1 && f->gmv_warp_allowed[b->ref[i]])
1876                 {
1877                     res = warp_affine(t, NULL, tmp[i], bw4 * 4 >> ss_hor,
1878                                       b_dim, 1 + pl,
1879                                       refp, &f->frame_hdr->gmv[b->ref[i]]);
1880                     if (res) return res;
1881                 } else {
1882                     res = mc(t, NULL, tmp[i], 0, bw4, bh4, t->bx, t->by,
1883                              1 + pl, b->mv[i], refp, b->ref[i], filter_2d);
1884                     if (res) return res;
1885                 }
1886             }
1887             pixel *const uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff;
1888             switch (b->comp_type) {
1889             case COMP_INTER_AVG:
1890                 dsp->mc.avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1891                             bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver
1892                             HIGHBD_CALL_SUFFIX);
1893                 break;
1894             case COMP_INTER_WEIGHTED_AVG:
1895                 dsp->mc.w_avg(uvdst, f->cur.stride[1], tmp[0], tmp[1],
1896                               bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, jnt_weight
1897                               HIGHBD_CALL_SUFFIX);
1898                 break;
1899             case COMP_INTER_WEDGE:
1900             case COMP_INTER_SEG:
1901                 dsp->mc.mask(uvdst, f->cur.stride[1],
1902                              tmp[b->mask_sign], tmp[!b->mask_sign],
1903                              bw4 * 4 >> ss_hor, bh4 * 4 >> ss_ver, mask
1904                              HIGHBD_CALL_SUFFIX);
1905                 break;
1906             }
1907         }
1908     }
1909 
1910     if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS) {
1911         hex_dump(dst, f->cur.stride[0], b_dim[0] * 4, b_dim[1] * 4, "y-pred");
1912         if (has_chroma) {
1913             hex_dump(&((pixel *) f->cur.data[1])[uvdstoff], f->cur.stride[1],
1914                      cbw4 * 4, cbh4 * 4, "u-pred");
1915             hex_dump(&((pixel *) f->cur.data[2])[uvdstoff], f->cur.stride[1],
1916                      cbw4 * 4, cbh4 * 4, "v-pred");
1917         }
1918     }
1919 
1920     const int cw4 = (w4 + ss_hor) >> ss_hor, ch4 = (h4 + ss_ver) >> ss_ver;
1921 
1922     if (b->skip) {
1923         // reset coef contexts
1924 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
1925         rep_macro(type, t->dir lcoef, off, mul * 0x40)
1926         case_set(bh4, l., 1, by4);
1927         case_set(bw4, a->, 0, bx4);
1928 #undef set_ctx
1929         if (has_chroma) {
1930 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
1931             rep_macro(type, t->dir ccoef[0], off, mul * 0x40); \
1932             rep_macro(type, t->dir ccoef[1], off, mul * 0x40)
1933             case_set(cbh4, l., 1, cby4);
1934             case_set(cbw4, a->, 0, cbx4);
1935 #undef set_ctx
1936         }
1937         return 0;
1938     }
1939 
1940     const TxfmInfo *const uvtx = &dav1d_txfm_dimensions[b->uvtx];
1941     const TxfmInfo *const ytx = &dav1d_txfm_dimensions[b->max_ytx];
1942     const uint16_t tx_split[2] = { b->tx_split0, b->tx_split1 };
1943 
1944     for (int init_y = 0; init_y < bh4; init_y += 16) {
1945         for (int init_x = 0; init_x < bw4; init_x += 16) {
1946             // coefficient coding & inverse transforms
1947             int y_off = !!init_y, y;
1948             dst += PXSTRIDE(f->cur.stride[0]) * 4 * init_y;
1949             for (y = init_y, t->by += init_y; y < imin(h4, init_y + 16);
1950                  y += ytx->h, y_off++)
1951             {
1952                 int x, x_off = !!init_x;
1953                 for (x = init_x, t->bx += init_x; x < imin(w4, init_x + 16);
1954                      x += ytx->w, x_off++)
1955                 {
1956                     read_coef_tree(t, bs, b, b->max_ytx, 0, tx_split,
1957                                    x_off, y_off, &dst[x * 4]);
1958                     t->bx += ytx->w;
1959                 }
1960                 dst += PXSTRIDE(f->cur.stride[0]) * 4 * ytx->h;
1961                 t->bx -= x;
1962                 t->by += ytx->h;
1963             }
1964             dst -= PXSTRIDE(f->cur.stride[0]) * 4 * y;
1965             t->by -= y;
1966 
1967             // chroma coefs and inverse transform
1968             if (has_chroma) for (int pl = 0; pl < 2; pl++) {
1969                 pixel *uvdst = ((pixel *) f->cur.data[1 + pl]) + uvdstoff +
1970                     (PXSTRIDE(f->cur.stride[1]) * init_y * 4 >> ss_ver);
1971                 for (y = init_y >> ss_ver, t->by += init_y;
1972                      y < imin(ch4, (init_y + 16) >> ss_ver); y += uvtx->h)
1973                 {
1974                     int x;
1975                     for (x = init_x >> ss_hor, t->bx += init_x;
1976                          x < imin(cw4, (init_x + 16) >> ss_hor); x += uvtx->w)
1977                     {
1978                         coef *cf;
1979                         int eob;
1980                         enum TxfmType txtp;
1981                         if (t->frame_thread.pass) {
1982                             const int p = t->frame_thread.pass & 1;
1983                             const int cbi = *ts->frame_thread[p].cbi++;
1984                             cf = ts->frame_thread[p].cf;
1985                             ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
1986                             eob  = cbi >> 5;
1987                             txtp = cbi & 0x1f;
1988                         } else {
1989                             uint8_t cf_ctx;
1990                             cf = bitfn(t->cf);
1991                             txtp = t->scratch.txtp_map[(by4 + (y << ss_ver)) * 32 +
1992                                                         bx4 + (x << ss_hor)];
1993                             eob = decode_coefs(t, &t->a->ccoef[pl][cbx4 + x],
1994                                                &t->l.ccoef[pl][cby4 + y],
1995                                                b->uvtx, bs, b, 0, 1 + pl,
1996                                                cf, &txtp, &cf_ctx);
1997                             if (DEBUG_BLOCK_INFO)
1998                                 printf("Post-uv-cf-blk[pl=%d,tx=%d,"
1999                                        "txtp=%d,eob=%d]: r=%d\n",
2000                                        pl, b->uvtx, txtp, eob, ts->msac.rng);
2001 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
2002                             rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
2003 #define default_memset(dir, diridx, off, sz) \
2004                             memset(&t->dir ccoef[pl][off], cf_ctx, sz)
2005                             case_set_upto16_with_default( \
2006                                      imin(uvtx->h, (f->bh - t->by + ss_ver) >> ss_ver),
2007                                      l., 1, cby4 + y);
2008                             case_set_upto16_with_default( \
2009                                      imin(uvtx->w, (f->bw - t->bx + ss_hor) >> ss_hor),
2010                                      a->, 0, cbx4 + x);
2011 #undef default_memset
2012 #undef set_ctx
2013                         }
2014                         if (eob >= 0) {
2015                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
2016                                 coef_dump(cf, uvtx->h * 4, uvtx->w * 4, 3, "dq");
2017                             dsp->itx.itxfm_add[b->uvtx]
2018                                               [txtp](&uvdst[4 * x],
2019                                                      f->cur.stride[1],
2020                                                      cf, eob HIGHBD_CALL_SUFFIX);
2021                             if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
2022                                 hex_dump(&uvdst[4 * x], f->cur.stride[1],
2023                                          uvtx->w * 4, uvtx->h * 4, "recon");
2024                         }
2025                         t->bx += uvtx->w << ss_hor;
2026                     }
2027                     uvdst += PXSTRIDE(f->cur.stride[1]) * 4 * uvtx->h;
2028                     t->bx -= x << ss_hor;
2029                     t->by += uvtx->h << ss_ver;
2030                 }
2031                 t->by -= y << ss_ver;
2032             }
2033         }
2034     }
2035     return 0;
2036 }
2037 
bytefn(dav1d_filter_sbrow_deblock_cols)2038 void bytefn(dav1d_filter_sbrow_deblock_cols)(Dav1dFrameContext *const f, const int sby) {
2039     if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK) ||
2040         (!f->frame_hdr->loopfilter.level_y[0] && !f->frame_hdr->loopfilter.level_y[1]))
2041     {
2042         return;
2043     }
2044     const int y = sby * f->sb_step * 4;
2045     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2046     pixel *const p[3] = {
2047         f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2048         f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2049         f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2050     };
2051     Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2052     bytefn(dav1d_loopfilter_sbrow_cols)(f, p, mask, sby,
2053                                         f->lf.start_of_tile_row[sby]);
2054 }
2055 
bytefn(dav1d_filter_sbrow_deblock_rows)2056 void bytefn(dav1d_filter_sbrow_deblock_rows)(Dav1dFrameContext *const f, const int sby) {
2057     const int y = sby * f->sb_step * 4;
2058     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2059     pixel *const p[3] = {
2060         f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2061         f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2062         f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2063     };
2064     Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2065     if (f->c->inloop_filters & DAV1D_INLOOPFILTER_DEBLOCK &&
2066         (f->frame_hdr->loopfilter.level_y[0] || f->frame_hdr->loopfilter.level_y[1]))
2067     {
2068         bytefn(dav1d_loopfilter_sbrow_rows)(f, p, mask, sby);
2069     }
2070     if (f->seq_hdr->cdef || f->lf.restore_planes) {
2071         // Store loop filtered pixels required by CDEF / LR
2072         bytefn(dav1d_copy_lpf)(f, p, sby);
2073     }
2074 }
2075 
bytefn(dav1d_filter_sbrow_cdef)2076 void bytefn(dav1d_filter_sbrow_cdef)(Dav1dTaskContext *const tc, const int sby) {
2077     const Dav1dFrameContext *const f = tc->f;
2078     if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_CDEF)) return;
2079     const int sbsz = f->sb_step;
2080     const int y = sby * sbsz * 4;
2081     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2082     pixel *const p[3] = {
2083         f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2084         f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2085         f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2086     };
2087     Av1Filter *prev_mask = f->lf.mask + ((sby - 1) >> !f->seq_hdr->sb128) * f->sb128w;
2088     Av1Filter *mask = f->lf.mask + (sby >> !f->seq_hdr->sb128) * f->sb128w;
2089     const int start = sby * sbsz;
2090     if (sby) {
2091         const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2092         pixel *p_up[3] = {
2093             p[0] - 8 * PXSTRIDE(f->cur.stride[0]),
2094             p[1] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2095             p[2] - (8 * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2096         };
2097         bytefn(dav1d_cdef_brow)(tc, p_up, prev_mask, start - 2, start, 1, sby);
2098     }
2099     const int n_blks = sbsz - 2 * (sby + 1 < f->sbh);
2100     const int end = imin(start + n_blks, f->bh);
2101     bytefn(dav1d_cdef_brow)(tc, p, mask, start, end, 0, sby);
2102 }
2103 
bytefn(dav1d_filter_sbrow_resize)2104 void bytefn(dav1d_filter_sbrow_resize)(Dav1dFrameContext *const f, const int sby) {
2105     const int sbsz = f->sb_step;
2106     const int y = sby * sbsz * 4;
2107     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2108     const pixel *const p[3] = {
2109         f->lf.p[0] + y * PXSTRIDE(f->cur.stride[0]),
2110         f->lf.p[1] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver),
2111         f->lf.p[2] + (y * PXSTRIDE(f->cur.stride[1]) >> ss_ver)
2112     };
2113     pixel *const sr_p[3] = {
2114         f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2115         f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2116         f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2117     };
2118     const int has_chroma = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400;
2119     for (int pl = 0; pl < 1 + 2 * has_chroma; pl++) {
2120         const int ss_ver = pl && f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2121         const int h_start = 8 * !!sby >> ss_ver;
2122         const ptrdiff_t dst_stride = f->sr_cur.p.stride[!!pl];
2123         pixel *dst = sr_p[pl] - h_start * PXSTRIDE(dst_stride);
2124         const ptrdiff_t src_stride = f->cur.stride[!!pl];
2125         const pixel *src = p[pl] - h_start * PXSTRIDE(src_stride);
2126         const int h_end = 4 * (sbsz - 2 * (sby + 1 < f->sbh)) >> ss_ver;
2127         const int ss_hor = pl && f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2128         const int dst_w = (f->sr_cur.p.p.w + ss_hor) >> ss_hor;
2129         const int src_w = (4 * f->bw + ss_hor) >> ss_hor;
2130         const int img_h = (f->cur.p.h - sbsz * 4 * sby + ss_ver) >> ss_ver;
2131 
2132         f->dsp->mc.resize(dst, dst_stride, src, src_stride, dst_w,
2133                           imin(img_h, h_end) + h_start, src_w,
2134                           f->resize_step[!!pl], f->resize_start[!!pl]
2135                           HIGHBD_CALL_SUFFIX);
2136     }
2137 }
2138 
bytefn(dav1d_filter_sbrow_lr)2139 void bytefn(dav1d_filter_sbrow_lr)(Dav1dFrameContext *const f, const int sby) {
2140     if (!(f->c->inloop_filters & DAV1D_INLOOPFILTER_RESTORATION)) return;
2141     const int y = sby * f->sb_step * 4;
2142     const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2143     pixel *const sr_p[3] = {
2144         f->lf.sr_p[0] + y * PXSTRIDE(f->sr_cur.p.stride[0]),
2145         f->lf.sr_p[1] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver),
2146         f->lf.sr_p[2] + (y * PXSTRIDE(f->sr_cur.p.stride[1]) >> ss_ver)
2147     };
2148     bytefn(dav1d_lr_sbrow)(f, sr_p, sby);
2149 }
2150 
bytefn(dav1d_filter_sbrow)2151 void bytefn(dav1d_filter_sbrow)(Dav1dFrameContext *const f, const int sby) {
2152     bytefn(dav1d_filter_sbrow_deblock_cols)(f, sby);
2153     bytefn(dav1d_filter_sbrow_deblock_rows)(f, sby);
2154     if (f->seq_hdr->cdef)
2155         bytefn(dav1d_filter_sbrow_cdef)(f->c->tc, sby);
2156     if (f->frame_hdr->width[0] != f->frame_hdr->width[1])
2157         bytefn(dav1d_filter_sbrow_resize)(f, sby);
2158     if (f->lf.restore_planes)
2159         bytefn(dav1d_filter_sbrow_lr)(f, sby);
2160 }
2161 
bytefn(dav1d_backup_ipred_edge)2162 void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
2163     const Dav1dFrameContext *const f = t->f;
2164     Dav1dTileState *const ts = t->ts;
2165     const int sby = t->by >> f->sb_shift;
2166     const int sby_off = f->sb128w * 128 * sby;
2167     const int x_off = ts->tiling.col_start;
2168 
2169     const pixel *const y =
2170         ((const pixel *) f->cur.data[0]) + x_off * 4 +
2171                     ((t->by + f->sb_step) * 4 - 1) * PXSTRIDE(f->cur.stride[0]);
2172     pixel_copy(&f->ipred_edge[0][sby_off + x_off * 4], y,
2173                4 * (ts->tiling.col_end - x_off));
2174 
2175     if (f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I400) {
2176         const int ss_ver = f->cur.p.layout == DAV1D_PIXEL_LAYOUT_I420;
2177         const int ss_hor = f->cur.p.layout != DAV1D_PIXEL_LAYOUT_I444;
2178 
2179         const ptrdiff_t uv_off = (x_off * 4 >> ss_hor) +
2180             (((t->by + f->sb_step) * 4 >> ss_ver) - 1) * PXSTRIDE(f->cur.stride[1]);
2181         for (int pl = 1; pl <= 2; pl++)
2182             pixel_copy(&f->ipred_edge[pl][sby_off + (x_off * 4 >> ss_hor)],
2183                        &((const pixel *) f->cur.data[pl])[uv_off],
2184                        4 * (ts->tiling.col_end - x_off) >> ss_hor);
2185     }
2186 }
2187 
bytefn(dav1d_copy_pal_block_y)2188 void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
2189                                     const int bx4, const int by4,
2190                                     const int bw4, const int bh4)
2191 
2192 {
2193     const Dav1dFrameContext *const f = t->f;
2194     pixel *const pal = t->frame_thread.pass ?
2195         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2196                             ((t->bx >> 1) + (t->by & 1))][0] :
2197         bytefn(t->scratch.pal)[0];
2198     for (int x = 0; x < bw4; x++)
2199         memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
2200     for (int y = 0; y < bh4; y++)
2201         memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
2202 }
2203 
bytefn(dav1d_copy_pal_block_uv)2204 void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
2205                                      const int bx4, const int by4,
2206                                      const int bw4, const int bh4)
2207 
2208 {
2209     const Dav1dFrameContext *const f = t->f;
2210     const pixel (*const pal)[8] = t->frame_thread.pass ?
2211         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2212                             ((t->bx >> 1) + (t->by & 1))] :
2213         bytefn(t->scratch.pal);
2214     // see aomedia bug 2183 for why we use luma coordinates here
2215     for (int pl = 1; pl <= 2; pl++) {
2216         for (int x = 0; x < bw4; x++)
2217             memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
2218         for (int y = 0; y < bh4; y++)
2219             memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
2220     }
2221 }
2222 
bytefn(dav1d_read_pal_plane)2223 void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
2224                                   const int pl, const int sz_ctx,
2225                                   const int bx4, const int by4)
2226 {
2227     Dav1dTileState *const ts = t->ts;
2228     const Dav1dFrameContext *const f = t->f;
2229     const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
2230                                            ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
2231     pixel cache[16], used_cache[8];
2232     int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
2233     int n_cache = 0;
2234     // don't reuse above palette outside SB64 boundaries
2235     int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
2236     const pixel *l = bytefn(t->al_pal)[1][by4][pl];
2237     const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
2238 
2239     // fill/sort cache
2240     while (l_cache && a_cache) {
2241         if (*l < *a) {
2242             if (!n_cache || cache[n_cache - 1] != *l)
2243                 cache[n_cache++] = *l;
2244             l++;
2245             l_cache--;
2246         } else {
2247             if (*a == *l) {
2248                 l++;
2249                 l_cache--;
2250             }
2251             if (!n_cache || cache[n_cache - 1] != *a)
2252                 cache[n_cache++] = *a;
2253             a++;
2254             a_cache--;
2255         }
2256     }
2257     if (l_cache) {
2258         do {
2259             if (!n_cache || cache[n_cache - 1] != *l)
2260                 cache[n_cache++] = *l;
2261             l++;
2262         } while (--l_cache > 0);
2263     } else if (a_cache) {
2264         do {
2265             if (!n_cache || cache[n_cache - 1] != *a)
2266                 cache[n_cache++] = *a;
2267             a++;
2268         } while (--a_cache > 0);
2269     }
2270 
2271     // find reused cache entries
2272     int i = 0;
2273     for (int n = 0; n < n_cache && i < pal_sz; n++)
2274         if (dav1d_msac_decode_bool_equi(&ts->msac))
2275             used_cache[i++] = cache[n];
2276     const int n_used_cache = i;
2277 
2278     // parse new entries
2279     pixel *const pal = t->frame_thread.pass ?
2280         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2281                             ((t->bx >> 1) + (t->by & 1))][pl] :
2282         bytefn(t->scratch.pal)[pl];
2283     if (i < pal_sz) {
2284         const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2285         int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
2286 
2287         if (i < pal_sz) {
2288             int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
2289             const int max = (1 << bpc) - 1;
2290 
2291             do {
2292                 const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2293                 prev = pal[i++] = imin(prev + delta + !pl, max);
2294                 if (prev + !pl >= max) {
2295                     for (; i < pal_sz; i++)
2296                         pal[i] = max;
2297                     break;
2298                 }
2299                 bits = imin(bits, 1 + ulog2(max - prev - !pl));
2300             } while (i < pal_sz);
2301         }
2302 
2303         // merge cache+new entries
2304         int n = 0, m = n_used_cache;
2305         for (i = 0; i < pal_sz; i++) {
2306             if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
2307                 pal[i] = used_cache[n++];
2308             } else {
2309                 assert(m < pal_sz);
2310                 pal[i] = pal[m++];
2311             }
2312         }
2313     } else {
2314         memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
2315     }
2316 
2317     if (DEBUG_BLOCK_INFO) {
2318         printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
2319                pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
2320         for (int n = 0; n < n_cache; n++)
2321             printf("%c%02x", n ? ' ' : '[', cache[n]);
2322         printf("%s, pal=", n_cache ? "]" : "[]");
2323         for (int n = 0; n < pal_sz; n++)
2324             printf("%c%02x", n ? ' ' : '[', pal[n]);
2325         printf("]\n");
2326     }
2327 }
2328 
bytefn(dav1d_read_pal_uv)2329 void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
2330                                const int sz_ctx, const int bx4, const int by4)
2331 {
2332     bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
2333 
2334     // V pal coding
2335     Dav1dTileState *const ts = t->ts;
2336     const Dav1dFrameContext *const f = t->f;
2337     pixel *const pal = t->frame_thread.pass ?
2338         f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
2339                             ((t->bx >> 1) + (t->by & 1))][2] :
2340         bytefn(t->scratch.pal)[2];
2341     const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
2342     if (dav1d_msac_decode_bool_equi(&ts->msac)) {
2343         const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
2344         int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
2345         const int max = (1 << bpc) - 1;
2346         for (int i = 1; i < b->pal_sz[1]; i++) {
2347             int delta = dav1d_msac_decode_bools(&ts->msac, bits);
2348             if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
2349             prev = pal[i] = (prev + delta) & max;
2350         }
2351     } else {
2352         for (int i = 0; i < b->pal_sz[1]; i++)
2353             pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
2354     }
2355     if (DEBUG_BLOCK_INFO) {
2356         printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
2357         for (int n = 0; n < b->pal_sz[1]; n++)
2358             printf("%c%02x", n ? ' ' : '[', pal[n]);
2359         printf("]\n");
2360     }
2361 }
2362