/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"

#include "threadframe.h"
#include "vp56.h"
#include "vp9.h"
#include "vp9data.h"
#include "vp9dec.h"

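// Splat the byte value v over a w x h region starting at ptr. Multiplying
// an 8-bit value by 0x0101... replicates it into every byte of a 16/32/64-bit
// word, so each row can be written with a single aligned store (or two on
// targets without fast 64-bit stores).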
static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
                                       ptrdiff_t stride, int v)
{
    switch (w) {
    case 1:
        do {
            *ptr = v;
            ptr += stride;
        } while (--h);
        break;
    case 2: {
        int v16 = v * 0x0101;
        do {
            AV_WN16A(ptr, v16);
            ptr += stride;
        } while (--h);
        break;
    }
    case 4: {
        uint32_t v32 = v * 0x01010101;
        do {
            AV_WN32A(ptr, v32);
            ptr += stride;
        } while (--h);
        break;
    }
    case 8: {
#if HAVE_FAST_64BIT
        uint64_t v64 = v * 0x0101010101010101ULL;
        do {
            AV_WN64A(ptr, v64);
            ptr += stride;
        } while (--h);
#else
        uint32_t v32 = v * 0x01010101;
        do {
            AV_WN32A(ptr,     v32);
            AV_WN32A(ptr + 4, v32);
            ptr += stride;
        } while (--h);
#endif
        break;
    }
    }
}

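// Decode all mode information for one block: segment id, skip flag,
// intra/inter decision, transform size, and either the intra prediction
// modes or the reference frames, interpolation filter and motion vectors.
// The decoded values are also splatted into the above/left context arrays
// that serve as priors for subsequent blocks.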
static void decode_mode(VP9TileData *td)
{
    static const uint8_t left_ctx[N_BS_SIZES] = {
        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
    };
    static const uint8_t above_ctx[N_BS_SIZES] = {
        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
    };
    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
        TX_16X16, TX_8X8,   TX_8X8,   TX_8X8,   TX_4X4,   TX_4X4,  TX_4X4
    };
    VP9Context *s = td->s;
    VP9Block *b = td->b;
    int row = td->row, col = td->col, row7 = td->row7;
    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
    int bw4 = ff_vp9_bwh_tab[1][b->bs][0], w4 = FFMIN(s->cols - col, bw4);
    int bh4 = ff_vp9_bwh_tab[1][b->bs][1], h4 = FFMIN(s->rows - row, bh4), y;
    int have_a = row > 0, have_l = col > td->tile_col_start;
    int vref, filter_id;

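    // Segment id: 0 when segmentation is off; read explicitly on keyframes
    // and intra-only frames; on inter frames it is either predicted from the
    // reference frame's segmentation map (temporal prediction) or re-read
    // from the bitstream, depending on the coded prediction flag.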
    if (!s->s.h.segmentation.enabled) {
        b->seg_id = 0;
    } else if (s->s.h.keyframe || s->s.h.intraonly) {
        b->seg_id = !s->s.h.segmentation.update_map ? 0 :
                    vp8_rac_get_tree(td->c, ff_vp9_segmentation_tree, s->s.h.segmentation.prob);
    } else if (!s->s.h.segmentation.update_map ||
               (s->s.h.segmentation.temporal &&
                vp56_rac_get_prob_branchy(td->c,
                    s->s.h.segmentation.pred_prob[s->above_segpred_ctx[col] +
                                    td->left_segpred_ctx[row7]]))) {
        if (!s->s.h.errorres && s->s.frames[REF_FRAME_SEGMAP].segmentation_map) {
            int pred = 8, x;
            uint8_t *refsegmap = s->s.frames[REF_FRAME_SEGMAP].segmentation_map;

            if (!s->s.frames[REF_FRAME_SEGMAP].uses_2pass)
                ff_thread_await_progress(&s->s.frames[REF_FRAME_SEGMAP].tf, row >> 3, 0);
            for (y = 0; y < h4; y++) {
                int idx_base = (y + row) * 8 * s->sb_cols + col;
                for (x = 0; x < w4; x++)
                    pred = FFMIN(pred, refsegmap[idx_base + x]);
            }
            av_assert1(pred < 8);
            b->seg_id = pred;
        } else {
            b->seg_id = 0;
        }

        memset(&s->above_segpred_ctx[col], 1, w4);
        memset(&td->left_segpred_ctx[row7], 1, h4);
    } else {
        b->seg_id = vp8_rac_get_tree(td->c, ff_vp9_segmentation_tree,
                                     s->s.h.segmentation.prob);

        memset(&s->above_segpred_ctx[col], 0, w4);
        memset(&td->left_segpred_ctx[row7], 0, h4);
    }
    if (s->s.h.segmentation.enabled &&
        (s->s.h.segmentation.update_map || s->s.h.keyframe || s->s.h.intraonly)) {
        setctx_2d(&s->s.frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
                  bw4, bh4, 8 * s->sb_cols, b->seg_id);
    }

    b->skip = s->s.h.segmentation.enabled &&
        s->s.h.segmentation.feat[b->seg_id].skip_enabled;
    if (!b->skip) {
        int c = td->left_skip_ctx[row7] + s->above_skip_ctx[col];
        b->skip = vp56_rac_get_prob(td->c, s->prob.p.skip[c]);
        td->counts.skip[c][b->skip]++;
    }

    if (s->s.h.keyframe || s->s.h.intraonly) {
        b->intra = 1;
    } else if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
        b->intra = !s->s.h.segmentation.feat[b->seg_id].ref_val;
    } else {
        int c, bit;

        if (have_a && have_l) {
            c = s->above_intra_ctx[col] + td->left_intra_ctx[row7];
            c += (c == 2);
        } else {
            c = have_a ? 2 * s->above_intra_ctx[col] :
                have_l ? 2 * td->left_intra_ctx[row7] : 0;
        }
        bit = vp56_rac_get_prob(td->c, s->prob.p.intra[c]);
        td->counts.intra[c][bit]++;
        b->intra = !bit;
    }

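    // Transform size: with TX_SWITCHABLE it is coded per block, with a
    // context derived from the above/left skip flags and transform sizes;
    // otherwise it is the frame-level mode capped at the largest size that
    // fits this block.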
    if ((b->intra || !b->skip) && s->s.h.txfmmode == TX_SWITCHABLE) {
        int c;
        if (have_a) {
            if (have_l) {
                c = (s->above_skip_ctx[col] ? max_tx :
                     s->above_txfm_ctx[col]) +
                    (td->left_skip_ctx[row7] ? max_tx :
                     td->left_txfm_ctx[row7]) > max_tx;
            } else {
                c = s->above_skip_ctx[col] ? 1 :
                    (s->above_txfm_ctx[col] * 2 > max_tx);
            }
        } else if (have_l) {
            c = td->left_skip_ctx[row7] ? 1 :
                (td->left_txfm_ctx[row7] * 2 > max_tx);
        } else {
            c = 1;
        }
        switch (max_tx) {
        case TX_32X32:
            b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][0]);
            if (b->tx) {
                b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][1]);
                if (b->tx == 2)
                    b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx32p[c][2]);
            }
            td->counts.tx32p[c][b->tx]++;
            break;
        case TX_16X16:
            b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx16p[c][0]);
            if (b->tx)
                b->tx += vp56_rac_get_prob(td->c, s->prob.p.tx16p[c][1]);
            td->counts.tx16p[c][b->tx]++;
            break;
        case TX_8X8:
            b->tx = vp56_rac_get_prob(td->c, s->prob.p.tx8p[c]);
            td->counts.tx8p[c][b->tx]++;
            break;
        case TX_4X4:
            b->tx = TX_4X4;
            break;
        }
    } else {
        b->tx = FFMIN(max_tx, s->s.h.txfmmode);
    }

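    // Intra prediction modes. On key/intra-only frames the y modes are coded
    // with fixed probabilities indexed by the above/left modes; for intra
    // blocks in inter frames, adaptive probabilities indexed by block-size
    // group are used instead. Blocks smaller than 8x8 code up to four
    // sub-block modes.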
    if (s->s.h.keyframe || s->s.h.intraonly) {
        uint8_t *a = &s->above_mode_ctx[col * 2];
        uint8_t *l = &td->left_mode_ctx[(row7) << 1];

        b->comp = 0;
        if (b->bs > BS_8x8) {
            // FIXME the memory storage intermediates here aren't really
            // necessary, they're just there to make the code slightly
            // simpler for now
            b->mode[0] =
            a[0]       = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                          ff_vp9_default_kf_ymode_probs[a[0]][l[0]]);
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                              ff_vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
                l[0]       =
                a[1]       = b->mode[1];
            } else {
                l[0]       =
                a[1]       =
                b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] =
                a[0]       = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                              ff_vp9_default_kf_ymode_probs[a[0]][l[1]]);
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                                  ff_vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
                    l[1]       =
                    a[1]       = b->mode[3];
                } else {
                    l[1]       =
                    a[1]       =
                    b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                l[1]       =
                a[1]       =
                b->mode[3] = b->mode[1];
            }
        } else {
            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                          ff_vp9_default_kf_ymode_probs[*a][*l]);
            b->mode[3] =
            b->mode[2] =
            b->mode[1] = b->mode[0];
            // FIXME this can probably be optimized
            memset(a, b->mode[0], ff_vp9_bwh_tab[0][b->bs][0]);
            memset(l, b->mode[0], ff_vp9_bwh_tab[0][b->bs][1]);
        }
        b->uvmode = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                     ff_vp9_default_kf_uvmode_probs[b->mode[3]]);
    } else if (b->intra) {
        b->comp = 0;
        if (b->bs > BS_8x8) {
            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                          s->prob.p.y_mode[0]);
            td->counts.y_mode[0][b->mode[0]]++;
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                td->counts.y_mode[0][b->mode[1]]++;
            } else {
                b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                td->counts.y_mode[0][b->mode[2]]++;
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                                  s->prob.p.y_mode[0]);
                    td->counts.y_mode[0][b->mode[3]]++;
                } else {
                    b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                b->mode[3] = b->mode[1];
            }
        } else {
            static const uint8_t size_group[10] = {
                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
            };
            int sz = size_group[b->bs];

            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                          s->prob.p.y_mode[sz]);
            b->mode[1] =
            b->mode[2] =
            b->mode[3] = b->mode[0];
            td->counts.y_mode[sz][b->mode[3]]++;
        }
        b->uvmode = vp8_rac_get_tree(td->c, ff_vp9_intramode_tree,
                                     s->prob.p.uv_mode[b->mode[3]]);
        td->counts.uv_mode[b->mode[3]][b->uvmode]++;
    } else {
        static const uint8_t inter_mode_ctx_lut[14][14] = {
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
        };

        if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].ref_enabled) {
            av_assert2(s->s.h.segmentation.feat[b->seg_id].ref_val != 0);
            b->comp = 0;
            b->ref[0] = s->s.h.segmentation.feat[b->seg_id].ref_val - 1;
        } else {
            // read comp_pred flag
            if (s->s.h.comppredmode != PRED_SWITCHABLE) {
                b->comp = s->s.h.comppredmode == PRED_COMPREF;
            } else {
                int c;

                // FIXME add intra as ref=0xff (or -1) to make these easier?
                if (have_a) {
                    if (have_l) {
                        if (s->above_comp_ctx[col] && td->left_comp_ctx[row7]) {
                            c = 4;
                        } else if (s->above_comp_ctx[col]) {
                            c = 2 + (td->left_intra_ctx[row7] ||
                                     td->left_ref_ctx[row7] == s->s.h.fixcompref);
                        } else if (td->left_comp_ctx[row7]) {
                            c = 2 + (s->above_intra_ctx[col] ||
                                     s->above_ref_ctx[col] == s->s.h.fixcompref);
                        } else {
                            c = (!s->above_intra_ctx[col] &&
                                 s->above_ref_ctx[col] == s->s.h.fixcompref) ^
                                (!td->left_intra_ctx[row7] &&
                                 td->left_ref_ctx[row & 7] == s->s.h.fixcompref);
                        }
                    } else {
                        c = s->above_comp_ctx[col] ? 3 :
                        (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->s.h.fixcompref);
                    }
                } else if (have_l) {
                    c = td->left_comp_ctx[row7] ? 3 :
                    (!td->left_intra_ctx[row7] && td->left_ref_ctx[row7] == s->s.h.fixcompref);
                } else {
                    c = 1;
                }
                b->comp = vp56_rac_get_prob(td->c, s->prob.p.comp[c]);
                td->counts.comp[c][b->comp]++;
            }

            // read actual references
            // FIXME probably cache a few variables here to prevent repetitive
            // memory accesses below
            if (b->comp) { /* two references */
                int fix_idx = s->s.h.signbias[s->s.h.fixcompref], var_idx = !fix_idx, c, bit;

                b->ref[fix_idx] = s->s.h.fixcompref;
                // FIXME can this codeblob be replaced by some sort of LUT?
                if (have_a) {
                    if (have_l) {
                        if (s->above_intra_ctx[col]) {
                            if (td->left_intra_ctx[row7]) {
                                c = 2;
                            } else {
                                c = 1 + 2 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                            }
                        } else if (td->left_intra_ctx[row7]) {
                            c = 1 + 2 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                        } else {
                            int refl = td->left_ref_ctx[row7], refa = s->above_ref_ctx[col];

                            if (refl == refa && refa == s->s.h.varcompref[1]) {
                                c = 0;
                            } else if (!td->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
                                if ((refa == s->s.h.fixcompref && refl == s->s.h.varcompref[0]) ||
                                    (refl == s->s.h.fixcompref && refa == s->s.h.varcompref[0])) {
                                    c = 4;
                                } else {
                                    c = (refa == refl) ? 3 : 1;
                                }
                            } else if (!td->left_comp_ctx[row7]) {
                                if (refa == s->s.h.varcompref[1] && refl != s->s.h.varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refl == s->s.h.varcompref[1] &&
                                         refa != s->s.h.varcompref[1]) ? 2 : 4;
                                }
                            } else if (!s->above_comp_ctx[col]) {
                                if (refl == s->s.h.varcompref[1] && refa != s->s.h.varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refa == s->s.h.varcompref[1] &&
                                         refl != s->s.h.varcompref[1]) ? 2 : 4;
                                }
                            } else {
                                c = (refl == refa) ? 4 : 2;
                            }
                        }
                    } else {
                        if (s->above_intra_ctx[col]) {
                            c = 2;
                        } else if (s->above_comp_ctx[col]) {
                            c = 4 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                        } else {
                            c = 3 * (s->above_ref_ctx[col] != s->s.h.varcompref[1]);
                        }
                    }
                } else if (have_l) {
                    if (td->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (td->left_comp_ctx[row7]) {
                        c = 4 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                    } else {
                        c = 3 * (td->left_ref_ctx[row7] != s->s.h.varcompref[1]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(td->c, s->prob.p.comp_ref[c]);
                b->ref[var_idx] = s->s.h.varcompref[bit];
                td->counts.comp_ref[c][bit]++;
            } else /* single reference */ {
                int bit, c;

                if (have_a && !s->above_intra_ctx[col]) {
                    if (have_l && !td->left_intra_ctx[row7]) {
                        if (td->left_comp_ctx[row7]) {
                            if (s->above_comp_ctx[col]) {
                                c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7] ||
                                         !s->above_ref_ctx[col]);
                            } else {
                                c = (3 * !s->above_ref_ctx[col]) +
                                    (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
                            }
                        } else if (s->above_comp_ctx[col]) {
                            c = (3 * !td->left_ref_ctx[row7]) +
                                (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
                        } else {
                            c = 2 * !td->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
                        }
                    } else if (s->above_intra_ctx[col]) {
                        c = 2;
                    } else if (s->above_comp_ctx[col]) {
                        c = 1 + (!s->s.h.fixcompref || !s->above_ref_ctx[col]);
                    } else {
                        c = 4 * (!s->above_ref_ctx[col]);
                    }
                } else if (have_l && !td->left_intra_ctx[row7]) {
                    if (td->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (td->left_comp_ctx[row7]) {
                        c = 1 + (!s->s.h.fixcompref || !td->left_ref_ctx[row7]);
                    } else {
                        c = 4 * (!td->left_ref_ctx[row7]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(td->c, s->prob.p.single_ref[c][0]);
                td->counts.single_ref[c][0][bit]++;
                if (!bit) {
                    b->ref[0] = 0;
                } else {
                    // FIXME can this codeblob be replaced by some sort of LUT?
                    if (have_a) {
                        if (have_l) {
                            if (td->left_intra_ctx[row7]) {
                                if (s->above_intra_ctx[col]) {
                                    c = 2;
                                } else if (s->above_comp_ctx[col]) {
                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else if (!s->above_ref_ctx[col]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->above_intra_ctx[col]) {
                                if (td->left_intra_ctx[row7]) {
                                    c = 2;
                                } else if (td->left_comp_ctx[row7]) {
                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                                 td->left_ref_ctx[row7] == 1);
                                } else if (!td->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (td->left_ref_ctx[row7] == 1);
                                }
                            } else if (s->above_comp_ctx[col]) {
                                if (td->left_comp_ctx[row7]) {
                                    if (td->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
                                        c = 3 * (s->s.h.fixcompref == 1 ||
                                                 td->left_ref_ctx[row7] == 1);
                                    } else {
                                        c = 2;
                                    }
                                } else if (!td->left_ref_ctx[row7]) {
                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else {
                                    c = 3 * (td->left_ref_ctx[row7] == 1) +
                                    (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
                                }
                            } else if (td->left_comp_ctx[row7]) {
                                if (!s->above_ref_ctx[col]) {
                                    c = 1 + 2 * (s->s.h.fixcompref == 1 ||
                                                 td->left_ref_ctx[row7] == 1);
                                } else {
                                    c = 3 * (s->above_ref_ctx[col] == 1) +
                                    (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->above_ref_ctx[col]) {
                                if (!td->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (td->left_ref_ctx[row7] == 1);
                                }
                            } else if (!td->left_ref_ctx[row7]) {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            } else {
                                c = 2 * (td->left_ref_ctx[row7] == 1) +
                                    2 * (s->above_ref_ctx[col] == 1);
                            }
                        } else {
                            if (s->above_intra_ctx[col] ||
                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
                                c = 2;
                            } else if (s->above_comp_ctx[col]) {
                                c = 3 * (s->s.h.fixcompref == 1 || s->above_ref_ctx[col] == 1);
                            } else {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            }
                        }
                    } else if (have_l) {
                        if (td->left_intra_ctx[row7] ||
                            (!td->left_comp_ctx[row7] && !td->left_ref_ctx[row7])) {
                            c = 2;
                        } else if (td->left_comp_ctx[row7]) {
                            c = 3 * (s->s.h.fixcompref == 1 || td->left_ref_ctx[row7] == 1);
                        } else {
                            c = 4 * (td->left_ref_ctx[row7] == 1);
                        }
                    } else {
                        c = 2;
                    }
                    bit = vp56_rac_get_prob(td->c, s->prob.p.single_ref[c][1]);
                    td->counts.single_ref[c][1][bit]++;
                    b->ref[0] = 1 + bit;
                }
            }
        }

        if (b->bs <= BS_8x8) {
            if (s->s.h.segmentation.enabled && s->s.h.segmentation.feat[b->seg_id].skip_enabled) {
                b->mode[0] =
                b->mode[1] =
                b->mode[2] =
                b->mode[3] = ZEROMV;
            } else {
                static const uint8_t off[10] = {
                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
                };

                // FIXME this needs to use the LUT tables from find_ref_mvs
                // because not all are -1,0/0,-1
                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
                                          [td->left_mode_ctx[row7 + off[b->bs]]];

                b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                b->mode[1] =
                b->mode[2] =
                b->mode[3] = b->mode[0];
                td->counts.mv_mode[c][b->mode[0] - 10]++;
            }
        }

        if (s->s.h.filtermode == FILTER_SWITCHABLE) {
            int c;

            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
                if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
                    c = s->above_filter_ctx[col] == td->left_filter_ctx[row7] ?
                        td->left_filter_ctx[row7] : 3;
                } else {
                    c = s->above_filter_ctx[col];
                }
            } else if (have_l && td->left_mode_ctx[row7] >= NEARESTMV) {
                c = td->left_filter_ctx[row7];
            } else {
                c = 3;
            }

            filter_id = vp8_rac_get_tree(td->c, ff_vp9_filter_tree,
                                         s->prob.p.filter[c]);
            td->counts.filter[c][filter_id]++;
            b->filter = ff_vp9_filter_lut[filter_id];
        } else {
            b->filter = s->s.h.filtermode;
        }

        if (b->bs > BS_8x8) {
            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][td->left_mode_ctx[row7]];

            b->mode[0] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                          s->prob.p.mv_mode[c]);
            td->counts.mv_mode[c][b->mode[0] - 10]++;
            ff_vp9_fill_mv(td, b->mv[0], b->mode[0], 0);

            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                td->counts.mv_mode[c][b->mode[1] - 10]++;
                ff_vp9_fill_mv(td, b->mv[1], b->mode[1], 1);
            } else {
                b->mode[1] = b->mode[0];
                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            }

            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                td->counts.mv_mode[c][b->mode[2] - 10]++;
                ff_vp9_fill_mv(td, b->mv[2], b->mode[2], 2);

                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(td->c, ff_vp9_inter_mode_tree,
                                                  s->prob.p.mv_mode[c]);
                    td->counts.mv_mode[c][b->mode[3] - 10]++;
                    ff_vp9_fill_mv(td, b->mv[3], b->mode[3], 3);
                } else {
                    b->mode[3] = b->mode[2];
                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
                }
            } else {
                b->mode[2] = b->mode[0];
                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
                b->mode[3] = b->mode[1];
                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
            }
        } else {
            ff_vp9_fill_mv(td, b->mv[0], b->mode[0], -1);
            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
        }

        vref = b->ref[b->comp ? s->s.h.signbias[s->s.h.varcompref[0]] : 0];
    }

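// SPLAT_CTX writes n copies of an 8-bit context value starting at &var,
// reusing the byte-replication trick from setctx_2d; the 64-bit variant is
// only used on targets with fast 64-bit stores. SET_CTXS then updates every
// above/left context array covered by the current block in one go.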
#if HAVE_FAST_64BIT
#define SPLAT_CTX(var, val, n) \
    switch (n) { \
    case 1:  var = val;                                    break; \
    case 2:  AV_WN16A(&var, val *             0x0101);     break; \
    case 4:  AV_WN32A(&var, val *         0x01010101);     break; \
    case 8:  AV_WN64A(&var, val * 0x0101010101010101ULL);  break; \
    case 16: { \
        uint64_t v64 = val * 0x0101010101010101ULL; \
        AV_WN64A(              &var,     v64); \
        AV_WN64A(&((uint8_t *) &var)[8], v64); \
        break; \
    } \
    }
#else
#define SPLAT_CTX(var, val, n) \
    switch (n) { \
    case 1:  var = val;                         break; \
    case 2:  AV_WN16A(&var, val *     0x0101);  break; \
    case 4:  AV_WN32A(&var, val * 0x01010101);  break; \
    case 8: { \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,     v32); \
        AV_WN32A(&((uint8_t *) &var)[4], v32); \
        break; \
    } \
    case 16: { \
        uint32_t v32 = val * 0x01010101; \
        AV_WN32A(              &var,      v32); \
        AV_WN32A(&((uint8_t *) &var)[4],  v32); \
        AV_WN32A(&((uint8_t *) &var)[8],  v32); \
        AV_WN32A(&((uint8_t *) &var)[12], v32); \
        break; \
    } \
    }
#endif

    switch (ff_vp9_bwh_tab[1][b->bs][0]) {
#define SET_CTXS(perf, dir, off, n) \
    do { \
        SPLAT_CTX(perf->dir##_skip_ctx[off],      b->skip,          n); \
        SPLAT_CTX(perf->dir##_txfm_ctx[off],      b->tx,            n); \
        SPLAT_CTX(perf->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
        if (!s->s.h.keyframe && !s->s.h.intraonly) { \
            SPLAT_CTX(perf->dir##_intra_ctx[off], b->intra,   n); \
            SPLAT_CTX(perf->dir##_comp_ctx[off],  b->comp,    n); \
            SPLAT_CTX(perf->dir##_mode_ctx[off],  b->mode[3], n); \
            if (!b->intra) { \
                SPLAT_CTX(perf->dir##_ref_ctx[off], vref, n); \
                if (s->s.h.filtermode == FILTER_SWITCHABLE) { \
                    SPLAT_CTX(perf->dir##_filter_ctx[off], filter_id, n); \
                } \
            } \
        } \
    } while (0)
    case 1: SET_CTXS(s, above, col, 1); break;
    case 2: SET_CTXS(s, above, col, 2); break;
    case 4: SET_CTXS(s, above, col, 4); break;
    case 8: SET_CTXS(s, above, col, 8); break;
    }
    switch (ff_vp9_bwh_tab[1][b->bs][1]) {
    case 1: SET_CTXS(td, left, row7, 1); break;
    case 2: SET_CTXS(td, left, row7, 2); break;
    case 4: SET_CTXS(td, left, row7, 4); break;
    case 8: SET_CTXS(td, left, row7, 8); break;
    }
#undef SPLAT_CTX
#undef SET_CTXS

    if (!s->s.h.keyframe && !s->s.h.intraonly) {
        if (b->bs > BS_8x8) {
            int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
            AV_COPY32(&td->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
            AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][0], mv0);
            AV_WN32A(&td->left_mv_ctx[row7 * 2 + 1][1], mv1);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
            AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
            AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
        } else {
            int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);

            for (n = 0; n < w4 * 2; n++) {
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
                AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
            }
            for (n = 0; n < h4 * 2; n++) {
                AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][0], mv0);
                AV_WN32A(&td->left_mv_ctx[row7 * 2 + n][1], mv1);
            }
        }
    }

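    // Store the per-4x4 reference indices and motion vectors of this block
    // into the current frame's mv buffer, which later frames read back for
    // temporal motion vector prediction.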
    // FIXME kinda ugly
    for (y = 0; y < h4; y++) {
        int x, o = (row + y) * s->sb_cols * 8 + col;
        VP9mvrefPair *mv = &s->s.frames[CUR_FRAME].mv[o];

        if (b->intra) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] =
                mv[x].ref[1] = -1;
            }
        } else if (b->comp) {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = b->ref[1];
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
                AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
            }
        } else {
            for (x = 0; x < w4; x++) {
                mv[x].ref[0] = b->ref[0];
                mv[x].ref[1] = -1;
                AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
            }
        }
    }
}

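// Coefficient token decoding for one transform block, following the VP9
// token scheme: an end-of-block probability, a zero probability, then a
// small tree for values 1..4 and escape categories whose extra bits are
// coded with fixed probabilities. cache[] keeps a clamped magnitude (0..5)
// per position so the non-zero context of later coefficients can be derived
// from their two already-decoded neighbours in nb[].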
// FIXME merge cnt/eob arguments?
static av_always_inline int
decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
                        int is_tx32x32, int is8bitsperpixel, int bpp, unsigned (*cnt)[6][3],
                        unsigned (*eob)[6][2], uint8_t (*p)[6][11],
                        int nnz, const int16_t *scan, const int16_t (*nb)[2],
                        const int16_t *band_counts, int16_t *qmul)
{
    int i = 0, band = 0, band_left = band_counts[band];
    const uint8_t *tp = p[0][nnz];
    uint8_t cache[1024];

    do {
        int val, rc;

        val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
        eob[band][nnz][val]++;
        if (!val)
            break;

skip_eob:
        if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
            cnt[band][nnz][0]++;
            if (!--band_left)
                band_left = band_counts[++band];
            cache[scan[i]] = 0;
            nnz            = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
            tp             = p[band][nnz];
            if (++i == n_coeffs)
                break;  // invalid input; blocks should end with EOB
            goto skip_eob;
        }

        rc = scan[i];
        if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
            cnt[band][nnz][1]++;
            val       = 1;
            cache[rc] = 1;
        } else {
            cnt[band][nnz][2]++;
            if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
                if (!vp56_rac_get_prob_branchy(c, tp[4])) {
                    cache[rc] = val = 2;
                } else {
                    val       = 3 + vp56_rac_get_prob(c, tp[5]);
                    cache[rc] = 3;
                }
            } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
                cache[rc] = 4;
                if (!vp56_rac_get_prob_branchy(c, tp[7])) {
                    val  =  vp56_rac_get_prob(c, 159) + 5;
                } else {
                    val  = (vp56_rac_get_prob(c, 165) << 1) + 7;
                    val +=  vp56_rac_get_prob(c, 145);
                }
            } else { // cat 3-6
                cache[rc] = 5;
                if (!vp56_rac_get_prob_branchy(c, tp[8])) {
                    if (!vp56_rac_get_prob_branchy(c, tp[9])) {
                        val  = 11 + (vp56_rac_get_prob(c, 173) << 2);
                        val +=      (vp56_rac_get_prob(c, 148) << 1);
                        val +=       vp56_rac_get_prob(c, 140);
                    } else {
                        val  = 19 + (vp56_rac_get_prob(c, 176) << 3);
                        val +=      (vp56_rac_get_prob(c, 155) << 2);
                        val +=      (vp56_rac_get_prob(c, 140) << 1);
                        val +=       vp56_rac_get_prob(c, 135);
                    }
                } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
                    val  = (vp56_rac_get_prob(c, 180) << 4) + 35;
                    val += (vp56_rac_get_prob(c, 157) << 3);
                    val += (vp56_rac_get_prob(c, 141) << 2);
                    val += (vp56_rac_get_prob(c, 134) << 1);
                    val +=  vp56_rac_get_prob(c, 130);
                } else {
                    val = 67;
                    if (!is8bitsperpixel) {
                        if (bpp == 12) {
                            val += vp56_rac_get_prob(c, 255) << 17;
                            val += vp56_rac_get_prob(c, 255) << 16;
                        }
                        val +=  (vp56_rac_get_prob(c, 255) << 15);
                        val +=  (vp56_rac_get_prob(c, 255) << 14);
                    }
                    val += (vp56_rac_get_prob(c, 254) << 13);
                    val += (vp56_rac_get_prob(c, 254) << 12);
                    val += (vp56_rac_get_prob(c, 254) << 11);
                    val += (vp56_rac_get_prob(c, 252) << 10);
                    val += (vp56_rac_get_prob(c, 249) << 9);
                    val += (vp56_rac_get_prob(c, 243) << 8);
                    val += (vp56_rac_get_prob(c, 230) << 7);
                    val += (vp56_rac_get_prob(c, 196) << 6);
                    val += (vp56_rac_get_prob(c, 177) << 5);
                    val += (vp56_rac_get_prob(c, 153) << 4);
                    val += (vp56_rac_get_prob(c, 140) << 3);
                    val += (vp56_rac_get_prob(c, 133) << 2);
                    val += (vp56_rac_get_prob(c, 130) << 1);
                    val +=  vp56_rac_get_prob(c, 129);
                }
            }
        }
#define STORE_COEF(c, i, v) do { \
    if (is8bitsperpixel) { \
        c[i] = v; \
    } else { \
        AV_WN32A(&c[i * 2], v); \
    } \
} while (0)
        if (!--band_left)
            band_left = band_counts[++band];
        if (is_tx32x32)
            STORE_COEF(coef, rc, (int)((vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]) / 2);
        else
            STORE_COEF(coef, rc, (vp8_rac_get(c) ? -val : val) * (unsigned)qmul[!!i]);
        nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
        tp = p[band][nnz];
    } while (++i < n_coeffs);

    return i;
}

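// The four thin wrappers below pin the is_tx32x32/is8bitsperpixel (and bpp)
// arguments to constants, so the always-inlined generic decoder is
// specialized at compile time and the per-token branches on them disappear.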
static int decode_coeffs_b_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
                                unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                                uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                                const int16_t (*nb)[2], const int16_t *band_counts,
                                int16_t *qmul)
{
    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 1, 8, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
}

static int decode_coeffs_b32_8bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
                                  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                                  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                                  const int16_t (*nb)[2], const int16_t *band_counts,
                                  int16_t *qmul)
{
    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 1, 8, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
}

static int decode_coeffs_b_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
                                 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                                 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                                 const int16_t (*nb)[2], const int16_t *band_counts,
                                 int16_t *qmul)
{
    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 0, 0, td->s->s.h.bpp, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
}

static int decode_coeffs_b32_16bpp(VP9TileData *td, int16_t *coef, int n_coeffs,
                                   unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
                                   uint8_t (*p)[6][11], int nnz, const int16_t *scan,
                                   const int16_t (*nb)[2], const int16_t *band_counts,
                                   int16_t *qmul)
{
    return decode_coeffs_b_generic(td->c, coef, n_coeffs, 1, 0, td->s->s.h.bpp, cnt, eob, p,
                                   nnz, scan, nb, band_counts, qmul);
}

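// Decode the coefficients of all luma, then chroma, transform blocks inside
// the current block. The above/left arrays hold one non-zero flag per 4x4
// unit; for larger transforms MERGE_CTX collapses them into a single input
// context and SPLAT_CTX expands the result back over the covered units.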
static av_always_inline int decode_coeffs(VP9TileData *td, int is8bitsperpixel)
{
    VP9Context *s = td->s;
    VP9Block *b = td->b;
    int row = td->row, col = td->col;
    uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*c)[6][3] = td->counts.coef[b->tx][0 /* y */][!b->intra];
    unsigned (*e)[6][2] = td->counts.eob[b->tx][0 /* y */][!b->intra];
    int w4 = ff_vp9_bwh_tab[1][b->bs][0] << 1, h4 = ff_vp9_bwh_tab[1][b->bs][1] << 1;
    int end_x = FFMIN(2 * (s->cols - col), w4);
    int end_y = FFMIN(2 * (s->rows - row), h4);
    int n, pl, x, y, ret;
    int16_t (*qmul)[2] = s->s.h.segmentation.feat[b->seg_id].qmul;
    int tx = 4 * s->s.h.lossless + b->tx;
    const int16_t * const *yscans = ff_vp9_scans[tx];
    const int16_t (* const * ynbs)[2] = ff_vp9_scans_nb[tx];
    const int16_t *uvscan = ff_vp9_scans[b->uvtx][DCT_DCT];
    const int16_t (*uvnb)[2] = ff_vp9_scans_nb[b->uvtx][DCT_DCT];
    uint8_t *a = &s->above_y_nnz_ctx[col * 2];
    uint8_t *l = &td->left_y_nnz_ctx[(row & 7) << 1];
    static const int16_t band_counts[4][8] = {
        { 1, 2, 3, 4,  3,   16 - 13 },
        { 1, 2, 3, 4, 11,   64 - 21 },
        { 1, 2, 3, 4, 11,  256 - 21 },
        { 1, 2, 3, 4, 11, 1024 - 21 },
    };
    const int16_t *y_band_counts = band_counts[b->tx];
    const int16_t *uv_band_counts = band_counts[b->uvtx];
    int bytesperpixel = is8bitsperpixel ? 1 : 2;
    int total_coeff = 0;

#define MERGE(la, end, step, rd) \
    for (n = 0; n < end; n += step) \
        la[n] = !!rd(&la[n])
#define MERGE_CTX(step, rd) \
    do { \
        MERGE(l, end_y, step, rd); \
        MERGE(a, end_x, step, rd); \
    } while (0)

#define DECODE_Y_COEF_LOOP(step, mode_index, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            enum TxfmType txtp = ff_vp9_intra_txfm_type[b->mode[mode_index]]; \
            ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
                                    (td, td->block + 16 * n * bytesperpixel, 16 * step * step, \
                                     c, e, p, a[x] + l[y], yscans[txtp], \
                                     ynbs[txtp], y_band_counts, qmul[0]); \
            a[x] = l[y] = !!ret; \
            total_coeff |= !!ret; \
            if (step >= 4) { \
                AV_WN16A(&td->eob[n], ret); \
            } else { \
                td->eob[n] = ret; \
            } \
        } \
    }

#define SPLAT(la, end, step, cond) \
    if (step == 2) { \
        for (n = 1; n < end; n += step) \
            la[n] = la[n - 1]; \
    } else if (step == 4) { \
        if (cond) { \
            for (n = 0; n < end; n += step) \
                AV_WN32A(&la[n], la[n] * 0x01010101); \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
        } \
    } else /* step == 8 */ { \
        if (cond) { \
            if (HAVE_FAST_64BIT) { \
                for (n = 0; n < end; n += step) \
                    AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
            } else { \
                for (n = 0; n < end; n += step) { \
                    uint32_t v32 = la[n] * 0x01010101; \
                    AV_WN32A(&la[n],     v32); \
                    AV_WN32A(&la[n + 4], v32); \
                } \
            } \
        } else { \
            for (n = 0; n < end; n += step) \
                memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
        } \
    }
#define SPLAT_CTX(step) \
    do { \
        SPLAT(a, end_x, step, end_x == w4); \
        SPLAT(l, end_y, step, end_y == h4); \
    } while (0)

    /* y tokens */
    switch (b->tx) {
    case TX_4X4:
        DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
        break;
    case TX_8X8:
        MERGE_CTX(2, AV_RN16A);
        DECODE_Y_COEF_LOOP(2, 0,);
        SPLAT_CTX(2);
        break;
    case TX_16X16:
        MERGE_CTX(4, AV_RN32A);
        DECODE_Y_COEF_LOOP(4, 0,);
        SPLAT_CTX(4);
        break;
    case TX_32X32:
        MERGE_CTX(8, AV_RN64A);
        DECODE_Y_COEF_LOOP(8, 0, 32);
        SPLAT_CTX(8);
        break;
    }

#define DECODE_UV_COEF_LOOP(step, v) \
    for (n = 0, y = 0; y < end_y; y += step) { \
        for (x = 0; x < end_x; x += step, n += step * step) { \
            ret = (is8bitsperpixel ? decode_coeffs_b##v##_8bpp : decode_coeffs_b##v##_16bpp) \
                                    (td, td->uvblock[pl] + 16 * n * bytesperpixel, \
                                     16 * step * step, c, e, p, a[x] + l[y], \
                                     uvscan, uvnb, uv_band_counts, qmul[1]); \
            a[x] = l[y] = !!ret; \
            total_coeff |= !!ret; \
            if (step >= 4) { \
                AV_WN16A(&td->uveob[pl][n], ret); \
            } else { \
                td->uveob[pl][n] = ret; \
            } \
        } \
    }

    p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
    c = td->counts.coef[b->uvtx][1 /* uv */][!b->intra];
    e = td->counts.eob[b->uvtx][1 /* uv */][!b->intra];
    w4 >>= s->ss_h;
    end_x >>= s->ss_h;
    h4 >>= s->ss_v;
    end_y >>= s->ss_v;
    for (pl = 0; pl < 2; pl++) {
        a = &s->above_uv_nnz_ctx[pl][col << !s->ss_h];
        l = &td->left_uv_nnz_ctx[pl][(row & 7) << !s->ss_v];
        switch (b->uvtx) {
        case TX_4X4:
            DECODE_UV_COEF_LOOP(1,);
            break;
        case TX_8X8:
            MERGE_CTX(2, AV_RN16A);
            DECODE_UV_COEF_LOOP(2,);
            SPLAT_CTX(2);
            break;
        case TX_16X16:
            MERGE_CTX(4, AV_RN32A);
            DECODE_UV_COEF_LOOP(4,);
            SPLAT_CTX(4);
            break;
        case TX_32X32:
            MERGE_CTX(8, AV_RN64A);
            DECODE_UV_COEF_LOOP(8, 32);
            SPLAT_CTX(8);
            break;
        }
    }

    return total_coeff;
}

static int decode_coeffs_8bpp(VP9TileData *td)
{
    return decode_coeffs(td, 1);
}

static int decode_coeffs_16bpp(VP9TileData *td)
{
    return decode_coeffs(td, 0);
}

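// Build up the loopfilter masks for one block. In each mask[dir][y][f]
// entry, dir 0 collects vertical (column) edges and dir 1 horizontal (row)
// edges, y is the 8-pixel row within the superblock, f selects the filter
// width, and each bit marks one 8-pixel column. The comments below cover
// the chroma-subsampling special cases.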
mask_edges(uint8_t (* mask)[8][4],int ss_h,int ss_v,int row_and_7,int col_and_7,int w,int h,int col_end,int row_end,enum TxfmMode tx,int skip_inter)1139 static av_always_inline void mask_edges(uint8_t (*mask)[8][4], int ss_h, int ss_v,
1140                                         int row_and_7, int col_and_7,
1141                                         int w, int h, int col_end, int row_end,
1142                                         enum TxfmMode tx, int skip_inter)
1143 {
1144     static const unsigned wide_filter_col_mask[2] = { 0x11, 0x01 };
1145     static const unsigned wide_filter_row_mask[2] = { 0x03, 0x07 };
1146 
1147     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
1148     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
1149     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
1150     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
1151 
1152     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
1153     // edges. This means that for UV, we work on two subsampled blocks at
1154     // a time, and we only use the topleft block's mode information to set
1155     // things like block strength. Thus, for any block size smaller than
1156     // 16x16, ignore the odd portion of the block.
1157     if (tx == TX_4X4 && (ss_v | ss_h)) {
1158         if (h == ss_v) {
1159             if (row_and_7 & 1)
1160                 return;
1161             if (!row_end)
1162                 h += 1;
1163         }
1164         if (w == ss_h) {
1165             if (col_and_7 & 1)
1166                 return;
1167             if (!col_end)
1168                 w += 1;
1169         }
1170     }
1171 
1172     if (tx == TX_4X4 && !skip_inter) {
1173         int t = 1 << col_and_7, m_col = (t << w) - t, y;
1174         // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
1175         int m_row_8 = m_col & wide_filter_col_mask[ss_h], m_row_4 = m_col - m_row_8;
1176 
1177         for (y = row_and_7; y < h + row_and_7; y++) {
            int col_mask_id = 2 - !(y & wide_filter_row_mask[ss_v]);

            mask[0][y][1] |= m_row_8;
            mask[0][y][2] |= m_row_4;
            // for odd lines, if the odd col is not being filtered,
            // skip odd row also:
            // .---. <-- a
            // |   |
            // |___| <-- b
            // ^   ^
            // c   d
            //
            // if a/c are even row/col and b/d are odd, and d is skipped,
            // e.g. right edge of size-66x66.webm, then skip b also (bug)
            if ((ss_h & ss_v) && (col_end & 1) && (y & 1)) {
                mask[1][y][col_mask_id] |= (t << (w - 1)) - t;
            } else {
                mask[1][y][col_mask_id] |= m_col;
            }
            if (!ss_h)
                mask[0][y][3] |= m_col;
            if (!ss_v) {
                if (ss_h && (col_end & 1))
                    mask[1][y][3] |= (t << (w - 1)) - t;
                else
                    mask[1][y][3] |= m_col;
            }
        }
    } else {
        int y, t = 1 << col_and_7, m_col = (t << w) - t;

        if (!skip_inter) {
            int mask_id = (tx == TX_8X8);
            int l2 = tx + ss_h - 1, step1d;
            static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
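            // transform edges repeat every (1 << l2) mask bits (bits count
            // luma 8px units, hence the +ss_h for chroma), so masks[l2]
            // keeps one vertical edge per transform block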
            int m_row = m_col & masks[l2];

            // at odd UV col/row edges of tx16/tx32 blocks, force the 8px-wide
            // loopfilter so we don't filter past the visible edge
            // ((w ^ (w - 1)) == 1 iff w is odd)
            if (ss_h && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
                int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
                int m_row_8 = m_row - m_row_16;

                for (y = row_and_7; y < h + row_and_7; y++) {
                    mask[0][y][0] |= m_row_16;
                    mask[0][y][1] |= m_row_8;
                }
            } else {
                for (y = row_and_7; y < h + row_and_7; y++)
                    mask[0][y][mask_id] |= m_row;
            }

            l2 = tx + ss_v - 1;
            step1d = 1 << l2;
            if (ss_v && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
                for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
                    mask[1][y][0] |= m_col;
                if (y - row_and_7 == h - 1)
                    mask[1][y][1] |= m_col;
            } else {
                for (y = row_and_7; y < h + row_and_7; y += step1d)
                    mask[1][y][mask_id] |= m_col;
            }
        } else if (tx != TX_4X4) {
            int mask_id;

            mask_id = (tx == TX_8X8) || (h == ss_v);
            mask[1][row_and_7][mask_id] |= m_col;
            mask_id = (tx == TX_8X8) || (w == ss_h);
            for (y = row_and_7; y < h + row_and_7; y++)
                mask[0][y][mask_id] |= t;
        } else {
            int t8 = t & wide_filter_col_mask[ss_h], t4 = t - t8;

            for (y = row_and_7; y < h + row_and_7; y++) {
                mask[0][y][2] |= t4;
                mask[0][y][1] |= t8;
            }
            mask[1][row_and_7][2 - !(row_and_7 & wide_filter_row_mask[ss_v])] |= m_col;
        }
    }
}

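// Decode a single block: parse modes and residual coefficients (pass < 2),
// reconstruct the pixels (pass != 1), and record the loopfilter edges the
// block contributes to its superblock.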
void ff_vp9_decode_block(VP9TileData *td, int row, int col,
                         VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
                         enum BlockLevel bl, enum BlockPartition bp)
{
    VP9Context *s = td->s;
    VP9Block *b = td->b;
    enum BlockSize bs = bl * 3 + bp;
    int bytesperpixel = s->bytesperpixel;
    int w4 = ff_vp9_bwh_tab[1][bs][0], h4 = ff_vp9_bwh_tab[1][bs][1], lvl;
    int emu[2];
    AVFrame *f = s->s.frames[CUR_FRAME].tf.f;

    td->row = row;
    td->row7 = row & 7;
    td->col = col;
    td->col7 = col & 7;

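    // clamp MVs so references stay within 128 eighth-pel units (16px) of the
    // frame edge; row/col are in 8px units, i.e. 64 eighth-pel units each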
    td->min_mv.x = -(128 + col * 64);
    td->min_mv.y = -(128 + row * 64);
    td->max_mv.x = 128 + (s->cols - col - w4) * 64;
    td->max_mv.y = 128 + (s->rows - row - h4) * 64;

    if (s->pass < 2) {
        b->bs = bs;
        b->bl = bl;
        b->bp = bp;
        decode_mode(td);
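        // the chroma tx is one size smaller than the luma tx whenever
        // subsampling makes the block only half as wide/tall as the luma tx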
        b->uvtx = b->tx - ((s->ss_h && w4 * 2 == (1 << b->tx)) ||
                           (s->ss_v && h4 * 2 == (1 << b->tx)));

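        // record this block's position and size when block-structure export
        // (AVVideoEncParams side data) is enabled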
        if (td->block_structure) {
            td->block_structure[td->nb_block_structure].row = row;
            td->block_structure[td->nb_block_structure].col = col;
            td->block_structure[td->nb_block_structure].block_size_idx_x = av_log2(w4);
            td->block_structure[td->nb_block_structure].block_size_idx_y = av_log2(h4);
            td->nb_block_structure++;
        }

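        // parse residual coefficients; an inter block of 8x8 or larger whose
        // coefficients all decode to zero is retroactively marked as skipped,
        // so the loopfilter treats it like a skipped block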
        if (!b->skip) {
            int has_coeffs;

            if (bytesperpixel == 1) {
                has_coeffs = decode_coeffs_8bpp(td);
            } else {
                has_coeffs = decode_coeffs_16bpp(td);
            }
            if (!has_coeffs && b->bs <= BS_8x8 && !b->intra) {
                b->skip = 1;
                memset(&s->above_skip_ctx[col], 1, w4);
                memset(&td->left_skip_ctx[td->row7], 1, h4);
            }
        } else {
            int row7 = td->row7;

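            // a skipped block carries no coefficients: clear the above/left
            // non-zero-coefficient (nnz) contexts over the whole block so
            // later neighbours see an all-zero coefficient history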
#define SPLAT_ZERO_CTX(v, n) \
    switch (n) { \
    case 1:  v = 0;          break; \
    case 2:  AV_ZERO16(&v);  break; \
    case 4:  AV_ZERO32(&v);  break; \
    case 8:  AV_ZERO64(&v);  break; \
    case 16: AV_ZERO128(&v); break; \
    }
#define SPLAT_ZERO_YUV(dir, var, off, n, dir2) \
    do { \
        SPLAT_ZERO_CTX(dir##_y_##var[off * 2], n * 2); \
        if (s->ss_##dir2) { \
            SPLAT_ZERO_CTX(dir##_uv_##var[0][off], n); \
            SPLAT_ZERO_CTX(dir##_uv_##var[1][off], n); \
        } else { \
            SPLAT_ZERO_CTX(dir##_uv_##var[0][off * 2], n * 2); \
            SPLAT_ZERO_CTX(dir##_uv_##var[1][off * 2], n * 2); \
        } \
    } while (0)

            switch (w4) {
            case 1: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 1, h); break;
            case 2: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 2, h); break;
            case 4: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 4, h); break;
            case 8: SPLAT_ZERO_YUV(s->above, nnz_ctx, col, 8, h); break;
            }
            switch (h4) {
            case 1: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 1, v); break;
            case 2: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 2, v); break;
            case 4: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 4, v); break;
            case 8: SPLAT_ZERO_YUV(td->left, nnz_ctx, row7, 8, v); break;
            }
        }

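        // in the first of two passes only the bitstream is parsed: advance
        // the per-block storage pointers and defer reconstruction to pass 2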
        if (s->pass == 1) {
            s->td[0].b++;
            s->td[0].block += w4 * h4 * 64 * bytesperpixel;
            s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_h + s->ss_v);
            s->td[0].eob += 4 * w4 * h4;
            s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);
            s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_h + s->ss_v);

            return;
        }
    }

    // render into emulated-edge scratch buffers when the target frame's
    // stride can't hold the block's overhang; this keeps emu-edge and the
    // like working even with large block overhangs
    emu[0] = (col + w4) * 8 * bytesperpixel > f->linesize[0] ||
             (row + h4) > s->rows;
    emu[1] = ((col + w4) * 8 >> s->ss_h) * bytesperpixel > f->linesize[1] ||
             (row + h4) > s->rows;
    if (emu[0]) {
        td->dst[0] = td->tmp_y;
        td->y_stride = 128;
    } else {
        td->dst[0] = f->data[0] + yoff;
        td->y_stride = f->linesize[0];
    }
    if (emu[1]) {
        td->dst[1] = td->tmp_uv[0];
        td->dst[2] = td->tmp_uv[1];
        td->uv_stride = 128;
    } else {
        td->dst[1] = f->data[1] + uvoff;
        td->dst[2] = f->data[2] + uvoff;
        td->uv_stride = f->linesize[1];
    }
    if (b->intra) {
        if (s->s.h.bpp > 8) {
            ff_vp9_intra_recon_16bpp(td, yoff, uvoff);
        } else {
            ff_vp9_intra_recon_8bpp(td, yoff, uvoff);
        }
    } else {
        if (s->s.h.bpp > 8) {
            ff_vp9_inter_recon_16bpp(td);
        } else {
            ff_vp9_inter_recon_8bpp(td);
        }
    }
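    // copy the edge-emulated scratch buffers back into the frame in
    // power-of-two chunks (64 >> n px wide) using the plain-copy MC
    // functions (mx/my = 0)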
    if (emu[0]) {
        int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;

        for (n = 0; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o * bytesperpixel, f->linesize[0],
                                         td->tmp_y + o * bytesperpixel, 128, h, 0, 0);
                o += bw;
            }
        }
    }
    if (emu[1]) {
        int w = FFMIN(s->cols - col, w4) * 8 >> s->ss_h;
        int h = FFMIN(s->rows - row, h4) * 8 >> s->ss_v, n, o = 0;

        for (n = s->ss_h; o < w; n++) {
            int bw = 64 >> n;

            av_assert2(n <= 4);
            if (w & bw) {
                s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o * bytesperpixel, f->linesize[1],
                                         td->tmp_uv[0] + o * bytesperpixel, 128, h, 0, 0);
                s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o * bytesperpixel, f->linesize[2],
                                         td->tmp_uv[1] + o * bytesperpixel, 128, h, 0, 0);
                o += bw;
            }
        }
    }

    // pick filter level and find edges to apply filter to
    if (s->s.h.filter.level &&
        (lvl = s->s.h.segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
                                                        [b->mode[3] != ZEROMV]) > 0) {
        int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
        int skip_inter = !b->intra && b->skip, col7 = td->col7, row7 = td->row7;

        setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
        mask_edges(lflvl->mask[0], 0, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
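        // chroma edges; for odd frame sizes, tell mask_edges where the
        // visible area ends in the last superblock col/row so subsampled
        // blocks don't filter past it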
        if (s->ss_h || s->ss_v)
            mask_edges(lflvl->mask[1], s->ss_h, s->ss_v, row7, col7, x_end, y_end,
                       s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
                       s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
                       b->uvtx, skip_inter);
    }

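    // pass 2 consumes the data stored during pass 1; advance the same
    // per-block pointers as in the pass-1 path above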
    if (s->pass == 2) {
        s->td[0].b++;
        s->td[0].block += w4 * h4 * 64 * bytesperpixel;
        s->td[0].uvblock[0] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->td[0].uvblock[1] += w4 * h4 * 64 * bytesperpixel >> (s->ss_v + s->ss_h);
        s->td[0].eob += 4 * w4 * h4;
        s->td[0].uveob[0] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
        s->td[0].uveob[1] += 4 * w4 * h4 >> (s->ss_v + s->ss_h);
    }
}