/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 
12 #include "./vp9_rtcd.h"
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 
16 #include "vpx_dsp/quantize.h"
17 #include "vpx_mem/vpx_mem.h"
18 #include "vpx_ports/mem.h"
19 
20 #include "vp9/common/vp9_idct.h"
21 #include "vp9/common/vp9_reconinter.h"
22 #include "vp9/common/vp9_reconintra.h"
23 #include "vp9/common/vp9_scan.h"
24 
25 #include "vp9/encoder/vp9_encodemb.h"
26 #include "vp9/encoder/vp9_rd.h"
27 #include "vp9/encoder/vp9_tokenize.h"
28 
29 struct optimize_ctx {
30   ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
31   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
32 };
33 
/* Fills the residual (src_diff) buffer of one plane of the current block.
 *
 * The plane's source buffer and its dst buffer are handed to
 * vpx_subtract_block() (or the highbd variant when the current frame buffer
 * carries YV12_FLAG_HIGHBITDEPTH); the result is stored in src_diff using a
 * stride equal to the plane block width.
 *
 *   x      macroblock being encoded
 *   bsize  block size, converted here to the plane's own block size
 *   plane  index into x->plane[] / x->e_mbd.plane[]
 */
void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
  const MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblock_plane *const src_plane = &x->plane[plane];
  const struct macroblockd_plane *const dst_plane = &xd->plane[plane];
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, dst_plane);
  /* Lookup tables count 4x4 blocks; scale to pixels. */
  const int cols = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  const int rows = 4 * num_4x4_blocks_high_lookup[plane_bsize];

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    vpx_highbd_subtract_block(rows, cols, src_plane->src_diff, cols,
                              src_plane->src.buf, src_plane->src.stride,
                              dst_plane->dst.buf, dst_plane->dst.stride,
                              xd->bd);
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH

  vpx_subtract_block(rows, cols, src_plane->src_diff, cols,
                     src_plane->src.buf, src_plane->src.stride,
                     dst_plane->dst.buf, dst_plane->dst.stride);
}
52 
/* Tie-breaker used by UPDATE_RD_COST(): the low 8 bits of (R * RM + 128).
 * DM and D are accepted so the argument list matches RDCOST, but they do not
 * take part in the result.
 */
#define RDTRUNC(RM, DM, R, D) (((R) * (RM) + 128) & 0xFF)
54 
55 typedef struct vp9_token_state {
56   int           rate;
57   int           error;
58   int           next;
59   int16_t       token;
60   int16_t       qc;
61 } vp9_token_state;
62 
63 // TODO(jimbankoski): experiment to find optimal RD numbers.
64 static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
65 
/* Recomputes rd_cost0 / rd_cost1 for the two candidate paths.
 *
 * Operates on the caller's locals: reads rdmult, rddiv, rate0, rate1, error0
 * and error1, and writes rd_cost0 and rd_cost1.  When both RDCOST values are
 * equal, the costs are replaced by their RDTRUNC values so that the
 * comparison following the macro can still tell the two paths apart.
 *
 * The body is wrapped in do { } while (0) so that "UPDATE_RD_COST();" forms
 * exactly one statement and stays correct inside an unbraced if/else.
 */
#define UPDATE_RD_COST()                                  \
  do {                                                    \
    rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);      \
    rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);      \
    if (rd_cost0 == rd_cost1) {                           \
      rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);   \
      rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);   \
    }                                                     \
  } while (0)
75 
76 // This function is a place holder for now but may ultimately need
77 // to scan previous tokens to work out the correct context.
trellis_get_coeff_context(const int16_t * scan,const int16_t * nb,int idx,int token,uint8_t * token_cache)78 static int trellis_get_coeff_context(const int16_t *scan,
79                                      const int16_t *nb,
80                                      int idx, int token,
81                                      uint8_t *token_cache) {
82   int bak = token_cache[scan[idx]], pt;
83   token_cache[scan[idx]] = vp9_pt_energy_class[token];
84   pt = get_coef_context(nb, token_cache, idx + 1);
85   token_cache[scan[idx]] = bak;
86   return pt;
87 }
88 
/* Trellis (Viterbi) optimization of one transform block's quantized
 * coefficients.
 *
 * Walking the scan order backwards from the current end-of-block, every
 * non-zero quantized coefficient gets two candidate states:
 *   [0] the coefficient as currently quantized;
 *   [1] the coefficient moved one step toward zero, provided the "shortcut"
 *       range test on the dequantized magnitude passes (otherwise the value
 *       is left unchanged).
 * Each state records the accumulated rate and squared error of the cheaper
 * of the two successor paths.  After the walk, the cheaper overall path is
 * written back into qcoeff/dqcoeff and the end-of-block position is updated.
 *
 *   mb       macroblock being encoded
 *   plane    plane index
 *   block    block index used with BLOCK_OFFSET and p->eobs[]
 *   tx_size  transform size of the block
 *   ctx      entropy context used to cost the first token
 *
 * Returns the new end-of-block value, which is also stored in p->eobs[block].
 */
static int optimize_b(MACROBLOCK *mb, int plane, int block,
                      TX_SIZE tx_size, int ctx) {
  MACROBLOCKD *const xd = &mb->e_mbd;
  struct macroblock_plane *const p = &mb->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  const int ref = is_inter_block(&xd->mi[0]->mbmi);
  vp9_token_state tokens[1025][2];
  unsigned best_index[1025][2];
  uint8_t token_cache[1024];
  const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  const int eob = p->eobs[block];
  const PLANE_TYPE type = get_plane_type(plane);
  /* Number of coefficients in the transform block: 16, 64, 256 or 1024. */
  const int default_eob = 16 << (tx_size << 1);
  const int mul = 1 + (tx_size == TX_32X32);
  const int16_t *dequant_ptr = pd->dequant;
  const uint8_t *const band_translate = get_band_translate(tx_size);
  const scan_order *const so = get_scan(xd, tx_size, type, block);
  const int16_t *const scan = so->scan;
  const int16_t *const nb = so->neighbors;
  int next = eob, sz = 0;
  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
  int64_t rd_cost0, rd_cost1;
  int rate0, rate1, error0, error1;
  int16_t t0, t1;
  EXTRABIT e0;
  int best, band, pt, i, final_eob;
#if CONFIG_VP9_HIGHBITDEPTH
  const int16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
#else
  const int16_t *cat6_high_cost = vp9_get_high_cost_table(8);
#endif

  assert((!type && !plane) || (type && plane));
  assert(eob <= default_eob);

  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
  if (!ref)
    rdmult = (rdmult * 9) >> 4;

  /* Initialize the sentinel node of the trellis. */
  tokens[eob][0].rate = 0;
  tokens[eob][0].error = 0;
  tokens[eob][0].next = default_eob;
  tokens[eob][0].token = EOB_TOKEN;
  tokens[eob][0].qc = 0;
  tokens[eob][1] = tokens[eob][0];

  /* Seed the token cache with the energy class of every current token. */
  for (i = 0; i < eob; i++)
    token_cache[scan[i]] =
        vp9_pt_energy_class[vp9_get_token(qcoeff[scan[i]])];

  for (i = eob; i-- > 0;) {
    int base_bits, d2, dx;
    const int rc = scan[i];
    int x = qcoeff[rc];
    /* Only add a trellis state for non-zero coefficients. */
    if (x) {
      /* Dequantizer step: entry 0 for raster position 0, entry 1 otherwise. */
      const int dqv = dequant_ptr[rc != 0];
      int shortcut;
      error0 = tokens[next][0].error;
      error1 = tokens[next][1].error;
      /* Evaluate the first possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;
      vp9_get_token_extra(x, &t0, &e0);
      /* Consider both possible successor states. */
      if (next < default_eob) {
        band = band_translate[i + 1];
        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
        rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
                                [tokens[next][0].token];
        rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
                                [tokens[next][1].token];
      }
      UPDATE_RD_COST();
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);
      dx = mul * (dqcoeff[rc] - coeff[rc]);
#if CONFIG_VP9_HIGHBITDEPTH
      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
        dx >>= xd->bd - 8;
      }
#endif  // CONFIG_VP9_HIGHBITDEPTH
      d2 = dx * dx;
      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][0].error = d2 + (best ? error1 : error0);
      tokens[i][0].next = next;
      tokens[i][0].token = t0;
      tokens[i][0].qc = x;
      best_index[i][0] = best;

      /* Evaluate the second possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;

      /* The lower magnitude is a candidate only when the dequantized value
       * overshoots the scaled source coefficient by less than one step.
       */
      shortcut = (abs(x) * dqv > abs(coeff[rc]) * mul) &&
                 (abs(x) * dqv < abs(coeff[rc]) * mul + dqv);

      if (shortcut) {
        /* sz is -1 for negative x and 0 otherwise, so this moves x one
         * step toward zero.
         */
        sz = -(x < 0);
        x -= 2 * sz + 1;
      }

      /* Consider both possible successor states. */
      if (!x) {
        /* If we reduced this coefficient to zero, check to see if
         *  we need to move the EOB back here.
         */
        t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
        t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
        e0 = 0;
      } else {
        vp9_get_token_extra(x, &t0, &e0);
        t1 = t0;
      }
      if (next < default_eob) {
        band = band_translate[i + 1];
        if (t0 != EOB_TOKEN) {
          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                  [tokens[next][0].token];
        }
        if (t1 != EOB_TOKEN) {
          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
                                  [tokens[next][1].token];
        }
      }

      UPDATE_RD_COST();
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = vp9_get_cost(t0, e0, cat6_high_cost);

      if (shortcut) {
        /* (v + sz) ^ sz yields v when sz is 0 and -v when sz is -1, so the
         * reconstruction error moves by one dequantizer step in the
         * direction in which x was changed.
         */
#if CONFIG_VP9_HIGHBITDEPTH
        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
          dx -= ((dqv >> (xd->bd - 8)) + sz) ^ sz;
        } else {
          dx -= (dqv + sz) ^ sz;
        }
#else
        dx -= (dqv + sz) ^ sz;
#endif  // CONFIG_VP9_HIGHBITDEPTH
        d2 = dx * dx;
      }
      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][1].error = d2 + (best ? error1 : error0);
      tokens[i][1].next = next;
      tokens[i][1].token = best ? t1 : t0;
      tokens[i][1].qc = x;
      best_index[i][1] = best;
      /* Finally, make this the new head of the trellis. */
      next = i;
    } else {
      /* There's no choice to make for a zero coefficient, so we don't
       *  add a new trellis node, but we do need to update the costs.
       */
      band = band_translate[i + 1];
      t0 = tokens[next][0].token;
      t1 = tokens[next][1].token;
      /* Update the cost of each path if we're past the EOB token. */
      if (t0 != EOB_TOKEN) {
        tokens[next][0].rate +=
            mb->token_costs[tx_size][type][ref][band][1][0][t0];
        tokens[next][0].token = ZERO_TOKEN;
      }
      if (t1 != EOB_TOKEN) {
        tokens[next][1].rate +=
            mb->token_costs[tx_size][type][ref][band][1][0][t1];
        tokens[next][1].token = ZERO_TOKEN;
      }
      best_index[i][0] = best_index[i][1] = 0;
      /* Don't update next, because we didn't add a new node. */
    }
  }

  /* Now pick the best path through the whole trellis. */
  /* The loop above leaves i at -1, so this reads band_translate[0]. */
  band = band_translate[i + 1];
  rate0 = tokens[next][0].rate;
  rate1 = tokens[next][1].rate;
  error0 = tokens[next][0].error;
  error1 = tokens[next][1].error;
  t0 = tokens[next][0].token;
  t1 = tokens[next][1].token;
  rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
  rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
  UPDATE_RD_COST();
  best = rd_cost1 < rd_cost0;
  final_eob = -1;
  /* Clear the whole block, then write back only the coefficients that lie
   * on the chosen path.
   */
  memset(qcoeff, 0, sizeof(*qcoeff) * default_eob);
  memset(dqcoeff, 0, sizeof(*dqcoeff) * default_eob);
  for (i = next; i < eob; i = next) {
    const int x = tokens[i][best].qc;
    const int rc = scan[i];
    if (x) {
      final_eob = i;
    }

    qcoeff[rc] = x;
    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;

    next = tokens[i][best].next;
    best = best_index[i][best];
  }
  /* Convert "index of the last non-zero coefficient" into a count. */
  final_eob++;

  p->eobs[block] = final_eob;
  return final_eob;
}
305 
fdct32x32(int rd_transform,const int16_t * src,tran_low_t * dst,int src_stride)306 static INLINE void fdct32x32(int rd_transform,
307                              const int16_t *src, tran_low_t *dst,
308                              int src_stride) {
309   if (rd_transform)
310     vpx_fdct32x32_rd(src, dst, src_stride);
311   else
312     vpx_fdct32x32(src, dst, src_stride);
313 }
314 
#if CONFIG_VP9_HIGHBITDEPTH
/* High-bit-depth counterpart of fdct32x32(): a non-zero rd_transform selects
 * vpx_highbd_fdct32x32_rd(), otherwise vpx_highbd_fdct32x32() is used.
 */
static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
                                    tran_low_t *dst, int src_stride) {
  if (!rd_transform) {
    vpx_highbd_fdct32x32(src, dst, src_stride);
    return;
  }
  vpx_highbd_fdct32x32_rd(src, dst, src_stride);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
324 
/* Forward-transforms and quantizes one transform block using the "fp"
 * quantizers (p->round_fp / p->quant_fp).
 *
 * The residual is read from p->src_diff at the block's raster position; the
 * transform output goes to coeff, the quantizer output to qcoeff/dqcoeff and
 * the end-of-block position to p->eobs[block].  The default scan order for
 * tx_size is always used.
 *
 *   x            macroblock being encoded
 *   plane        plane index
 *   block        block index within the plane
 *   plane_bsize  block size of the plane
 *   tx_size      transform size (TX_4X4 .. TX_32X32)
 */
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  const scan_order *const so = &vp9_default_scan_orders[tx_size];
  const int16_t *const scan = so->scan;
  const int16_t *const iscan = so->iscan;
  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint16_t *const eob = &p->eobs[block];
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  const int16_t *src_diff;
  int blk_x, blk_y;

  /* Block position in units of 4 pixels. */
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &blk_x, &blk_y);
  src_diff = &p->src_diff[4 * (blk_y * diff_stride + blk_x)];

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    switch (tx_size) {
      case TX_32X32:
        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                                     p->round_fp, p->quant_fp, p->quant_shift,
                                     qcoeff, dqcoeff, pd->dequant,
                                     eob, scan, iscan);
        break;
      case TX_16X16:
        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin,
                               p->round_fp, p->quant_fp, p->quant_shift,
                               qcoeff, dqcoeff, pd->dequant,
                               eob, scan, iscan);
        break;
      case TX_8X8:
        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin,
                               p->round_fp, p->quant_fp, p->quant_shift,
                               qcoeff, dqcoeff, pd->dequant,
                               eob, scan, iscan);
        break;
      case TX_4X4:
        x->fwd_txm4x4(src_diff, coeff, diff_stride);
        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin,
                               p->round_fp, p->quant_fp, p->quant_shift,
                               qcoeff, dqcoeff, pd->dequant,
                               eob, scan, iscan);
        break;
      default:
        assert(0);
        break;
    }
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH

  switch (tx_size) {
    case TX_32X32:
      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
                            p->round_fp, p->quant_fp, p->quant_shift,
                            qcoeff, dqcoeff, pd->dequant,
                            eob, scan, iscan);
      break;
    case TX_16X16:
      vpx_fdct16x16(src_diff, coeff, diff_stride);
      vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin,
                      p->round_fp, p->quant_fp, p->quant_shift,
                      qcoeff, dqcoeff, pd->dequant,
                      eob, scan, iscan);
      break;
    case TX_8X8:
      /* Transform and quantization are done by one combined routine here. */
      vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
                        x->skip_block, p->zbin,
                        p->round_fp, p->quant_fp, p->quant_shift,
                        qcoeff, dqcoeff, pd->dequant,
                        eob, scan, iscan);
      break;
    case TX_4X4:
      x->fwd_txm4x4(src_diff, coeff, diff_stride);
      vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin,
                      p->round_fp, p->quant_fp, p->quant_shift,
                      qcoeff, dqcoeff, pd->dequant,
                      eob, scan, iscan);
      break;
    default:
      assert(0);
      break;
  }
}
414 
/* Fast-path transform and quantization of one transform block.
 *
 * For 8x8 and larger blocks the "_1" forward transforms are used (the 4x4
 * case goes through x->fwd_txm4x4), followed by the quantize_dc routines,
 * which receive only p->round, p->quant_fp[0] and pd->dequant[0].
 * Presumably only the DC coefficient is produced -- confirm against the
 * vpx_dsp implementations.  The end-of-block position is written to
 * p->eobs[block].
 *
 *   x            macroblock being encoded
 *   plane        plane index
 *   block        block index within the plane
 *   plane_bsize  block size of the plane
 *   tx_size      transform size (TX_4X4 .. TX_32X32)
 */
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint16_t *const eob = &p->eobs[block];
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  const int16_t *src_diff;
  int blk_x, blk_y;

  /* Block position in units of 4 pixels. */
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &blk_x, &blk_y);
  src_diff = &p->src_diff[4 * (blk_y * diff_stride + blk_x)];

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    switch (tx_size) {
      case TX_32X32:
        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
                                     p->quant_fp[0], qcoeff, dqcoeff,
                                     pd->dequant[0], eob);
        break;
      case TX_16X16:
        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
                               p->quant_fp[0], qcoeff, dqcoeff,
                               pd->dequant[0], eob);
        break;
      case TX_8X8:
        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
                               p->quant_fp[0], qcoeff, dqcoeff,
                               pd->dequant[0], eob);
        break;
      case TX_4X4:
        x->fwd_txm4x4(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
                               p->quant_fp[0], qcoeff, dqcoeff,
                               pd->dequant[0], eob);
        break;
      default:
        assert(0);
        break;
    }
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH

  switch (tx_size) {
    case TX_32X32:
      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
                            p->quant_fp[0], qcoeff, dqcoeff,
                            pd->dequant[0], eob);
      break;
    case TX_16X16:
      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
      break;
    case TX_8X8:
      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
      break;
    case TX_4X4:
      x->fwd_txm4x4(src_diff, coeff, diff_stride);
      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
      break;
    default:
      assert(0);
      break;
  }
}
495 
/* Full forward transform and quantization of one transform block using the
 * regular "b" quantizers (p->round / p->quant).
 *
 * The residual is read from p->src_diff at the block's raster position; the
 * transform output goes to coeff, the quantizer output to qcoeff/dqcoeff and
 * the end-of-block position to p->eobs[block].  The default scan order for
 * tx_size is always used.
 *
 *   x            macroblock being encoded
 *   plane        plane index
 *   block        block index within the plane
 *   plane_bsize  block size of the plane
 *   tx_size      transform size (TX_4X4 .. TX_32X32)
 */
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  const scan_order *const so = &vp9_default_scan_orders[tx_size];
  const int16_t *const scan = so->scan;
  const int16_t *const iscan = so->iscan;
  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint16_t *const eob = &p->eobs[block];
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  const int16_t *src_diff;
  int blk_x, blk_y;

  /* Block position in units of 4 pixels. */
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &blk_x, &blk_y);
  src_diff = &p->src_diff[4 * (blk_y * diff_stride + blk_x)];

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    switch (tx_size) {
      case TX_32X32:
        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
                                    p->round, p->quant, p->quant_shift,
                                    qcoeff, dqcoeff, pd->dequant,
                                    eob, scan, iscan);
        break;
      case TX_16X16:
        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin,
                              p->round, p->quant, p->quant_shift,
                              qcoeff, dqcoeff, pd->dequant,
                              eob, scan, iscan);
        break;
      case TX_8X8:
        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin,
                              p->round, p->quant, p->quant_shift,
                              qcoeff, dqcoeff, pd->dequant,
                              eob, scan, iscan);
        break;
      case TX_4X4:
        x->fwd_txm4x4(src_diff, coeff, diff_stride);
        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin,
                              p->round, p->quant, p->quant_shift,
                              qcoeff, dqcoeff, pd->dequant,
                              eob, scan, iscan);
        break;
      default:
        assert(0);
        break;
    }
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH

  switch (tx_size) {
    case TX_32X32:
      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
                           p->round, p->quant, p->quant_shift,
                           qcoeff, dqcoeff, pd->dequant,
                           eob, scan, iscan);
      break;
    case TX_16X16:
      vpx_fdct16x16(src_diff, coeff, diff_stride);
      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin,
                     p->round, p->quant, p->quant_shift,
                     qcoeff, dqcoeff, pd->dequant,
                     eob, scan, iscan);
      break;
    case TX_8X8:
      vpx_fdct8x8(src_diff, coeff, diff_stride);
      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin,
                     p->round, p->quant, p->quant_shift,
                     qcoeff, dqcoeff, pd->dequant,
                     eob, scan, iscan);
      break;
    case TX_4X4:
      x->fwd_txm4x4(src_diff, coeff, diff_stride);
      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin,
                     p->round, p->quant, p->quant_shift,
                     qcoeff, dqcoeff, pd->dequant,
                     eob, scan, iscan);
      break;
    default:
      assert(0);
      break;
  }
}
584 
/* Per-transform-block callback used by vp9_encode_sb().
 *
 * Steps, in order:
 *   1. If the block is flagged in x->zcoeff_blk (plane 0 only), force an
 *      empty block and return.
 *   2. Unless x->skip_recode is set, forward-transform and quantize the
 *      block; which routine is used depends on x->quant_fp and on the
 *      x->skip_txfm[] entry, and may again end in an empty block.
 *   3. Optionally run the trellis optimizer (optimize_b) and store whether
 *      the block has any coefficients in the two entropy contexts.
 *   4. Clear *args->skip if the block has coefficients.
 *   5. Unless x->skip_encode is set or the block is empty, add the inverse
 *      transform of dqcoeff to the dst buffer.
 *
 *   plane, block, plane_bsize, tx_size  identify the transform block
 *   arg                                 struct encode_b_args * (x, ctx, skip)
 */
static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
                         TX_SIZE tx_size, void *arg) {
  struct encode_b_args *const args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct optimize_ctx *const ctx = args->ctx;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  int i, j;
  uint8_t *dst;
  ENTROPY_CONTEXT *a, *l;
  /* i / j: block position in units of 4 pixels (horizontal / vertical). */
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
  a = &ctx->ta[plane][i];
  l = &ctx->tl[plane][j];

  // TODO(jingning): per transformed block zero forcing only enabled for
  // luma component. will integrate chroma components as well.
  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
    p->eobs[block] = 0;
    *a = *l = 0;
    return;
  }

  if (!x->skip_recode) {
    if (x->quant_fp) {
      // Encoding process for rtc mode
      if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
        // skip forward transform
        p->eobs[block] = 0;
        *a = *l = 0;
        return;
      } else {
        vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
      }
    } else {
      if (max_txsize_lookup[plane_bsize] == tx_size) {
        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
        if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
          // full forward transform and quantization
          vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
        } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
          // fast path forward transform and quantization
          vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
        } else {
          // skip forward transform
          p->eobs[block] = 0;
          *a = *l = 0;
          return;
        }
      } else {
        vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
      }
    }
  }

  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
    // Named so that it does not hide the optimize_ctx pointer `ctx` above.
    const int entropy_ctx = combine_entropy_contexts(*a, *l);
    *a = *l = optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
  } else {
    *a = *l = p->eobs[block] > 0;
  }

  if (p->eobs[block])
    *(args->skip) = 0;

  if (x->skip_encode || p->eobs[block] == 0)
    return;
#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    switch (tx_size) {
      case TX_32X32:
        vp9_highbd_idct32x32_add(dqcoeff, dst, pd->dst.stride,
                                 p->eobs[block], xd->bd);
        break;
      case TX_16X16:
        vp9_highbd_idct16x16_add(dqcoeff, dst, pd->dst.stride,
                                 p->eobs[block], xd->bd);
        break;
      case TX_8X8:
        vp9_highbd_idct8x8_add(dqcoeff, dst, pd->dst.stride,
                               p->eobs[block], xd->bd);
        break;
      case TX_4X4:
        // this is like vp9_short_idct4x4 but has a special case around eob<=1
        // which is significant (not just an optimization) for the lossless
        // case.
        x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride,
                           p->eobs[block], xd->bd);
        break;
      default:
        assert(0 && "Invalid transform size");
        break;
    }
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH

  switch (tx_size) {
    case TX_32X32:
      vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
      break;
    case TX_16X16:
      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
      break;
    case TX_8X8:
      vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
      break;
    case TX_4X4:
      // this is like vp9_short_idct4x4 but has a special case around eob<=1
      // which is significant (not just an optimization) for the lossless
      // case.
      x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
      break;
    default:
      assert(0 && "Invalid transform size");
      break;
  }
}
704 
/* Per-transform-block callback used by vp9_encode_sby_pass1().
 *
 * Runs the full transform/quantization (vp9_xform_quant) and, when the block
 * has at least one coefficient, adds the inverse transform of dqcoeff to the
 * dst buffer through x->itxm_add (or x->highbd_itxm_add for high-bit-depth
 * frames).
 *
 *   plane, block, plane_bsize, tx_size  identify the transform block
 *   arg                                 the MACROBLOCK being encoded
 */
static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
                               TX_SIZE tx_size, void *arg) {
  MACROBLOCK *const x = (MACROBLOCK *)arg;
  MACROBLOCKD *const xd = &x->e_mbd;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint8_t *dst;
  int blk_x, blk_y;

  /* Block position in units of 4 pixels. */
  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &blk_x, &blk_y);
  dst = &pd->dst.buf[4 * blk_y * pd->dst.stride + 4 * blk_x];

  vp9_xform_quant(x, plane, block, plane_bsize, tx_size);

  /* Nothing to reconstruct for an empty block. */
  if (p->eobs[block] == 0)
    return;

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    x->highbd_itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block], xd->bd);
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
  x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
}
729 
/* First-pass encode of plane 0 only: computes the residual for the block and
 * then runs encode_block_pass1() on each of its transform blocks.
 *
 *   x      macroblock being encoded
 *   bsize  block size
 */
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const int y_plane = 0;

  vp9_subtract_plane(x, bsize, y_plane);
  vp9_foreach_transformed_block_in_plane(xd, bsize, y_plane,
                                         encode_block_pass1, x);
}
735 
/* Encodes every plane of the current block.
 *
 * mbmi->skip starts out as 1 and is cleared by encode_block() as soon as any
 * transform block ends up with coefficients.  When x->skip is set, nothing
 * else is done.  Otherwise, for each plane: the residual is recomputed
 * (unless x->skip_recode), the entropy contexts are loaded when the trellis
 * optimizer is going to run, and encode_block() is invoked on each transform
 * block.
 *
 *   x      macroblock being encoded
 *   bsize  block size
 */
void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  struct optimize_ctx ctx;
  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
  int plane;

  mbmi->skip = 1;

  if (x->skip)
    return;

  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
    if (!x->skip_recode)
      vp9_subtract_plane(x, bsize, plane);

    /* Same condition under which encode_block() calls optimize_b(). */
    if (x->optimize && !(x->skip_recode && x->skip_optimize)) {
      const struct macroblockd_plane *const pd = &xd->plane[plane];
      TX_SIZE tx_size = mbmi->tx_size;
      if (plane)
        tx_size = get_uv_tx_size(mbmi, pd);
      vp9_get_entropy_contexts(bsize, tx_size, pd,
                               ctx.ta[plane], ctx.tl[plane]);
    }

    vp9_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
                                           &arg);
  }
}
763 
// Intra-codes a single transform block of `plane`.
//
// Steps, in order:
//   1. Locate the block inside the plane (col/row are in units of 4 samples)
//      and pick the prediction mode, transform type and scan order.
//   2. Write the intra prediction into the destination buffer. The predictor
//      reads its reference samples from the destination buffer, or from the
//      source buffer when x->skip_encode is set.
//   3. Unless x->skip_recode is set: subtract the prediction from the source,
//      forward-transform the residual and quantize it (this writes qcoeff,
//      dqcoeff and the block's eob).
//   4. Unless x->skip_encode is set: if eob is non-zero, inverse-transform
//      the dequantized coefficients and add them onto the destination.
//   5. If eob is non-zero, clear the skip flag referenced by the arguments.
//
//   plane        plane index (0 selects the luma mode, otherwise uv_mode)
//   block        block index used to address coefficient buffers and eobs
//   plane_bsize  block size of the plane
//   tx_size      transform size of this block
//   arg          pointer to a struct encode_b_args (its x and skip members
//                are used here)
void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                            TX_SIZE tx_size, void *arg) {
  struct encode_b_args *const args = arg;
  MACROBLOCK *const x = args->x;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
  struct macroblock_plane *const p = &x->plane[plane];
  struct macroblockd_plane *const pd = &xd->plane[plane];
  tran_low_t *const coeffs = BLOCK_OFFSET(p->coeff, block);
  tran_low_t *const quantized = BLOCK_OFFSET(p->qcoeff, block);
  tran_low_t *const dequantized = BLOCK_OFFSET(pd->dqcoeff, block);
  uint16_t *const eob = &p->eobs[block];
  const int bwl = b_width_log2_lookup[plane_bsize];
  const int residual_stride = 4 * (1 << bwl);
  const int src_stride = p->src.stride;
  const int dst_stride = pd->dst.stride;
  const int recode = !x->skip_recode;
  const int reconstruct = !x->skip_encode;
  const scan_order *order;
  TX_TYPE tx_type = DCT_DCT;
  PREDICTION_MODE pred_mode;
  uint8_t *src, *dst;
  int16_t *residual;
  int col, row;

  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &col, &row);
  dst = &pd->dst.buf[4 * (row * dst_stride + col)];
  src = &p->src.buf[4 * (row * src_stride + col)];
  residual = &p->src_diff[4 * (row * residual_stride + col)];

  // Prediction mode: chroma always uses uv_mode; luma 4x4 blocks carry a
  // per-block mode, larger luma blocks share the block-level mode.
  if (plane != 0) {
    pred_mode = mbmi->uv_mode;
  } else if (tx_size == TX_4X4) {
    pred_mode = get_y_mode(xd->mi[0], block);
  } else {
    pred_mode = mbmi->mode;
  }

  // Transform type and scan order: 32x32 keeps DCT_DCT and the default scan.
  if (tx_size == TX_32X32) {
    order = &vp9_default_scan_orders[TX_32X32];
  } else {
    if (tx_size == TX_4X4) {
      tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block);
    } else {
      tx_type = get_tx_type(get_plane_type(plane), xd);
    }
    order = &vp9_scan_orders[tx_size][tx_type];
  }

  // The prediction always lands in dst; only the reference samples differ.
  if (reconstruct) {
    vp9_predict_intra_block(xd, bwl, tx_size, pred_mode, dst, dst_stride,
                            dst, dst_stride, col, row, plane);
  } else {
    vp9_predict_intra_block(xd, bwl, tx_size, pred_mode, src, src_stride,
                            dst, dst_stride, col, row, plane);
  }

#if CONFIG_VP9_HIGHBITDEPTH
  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
    switch (tx_size) {
      case TX_32X32:
        if (recode) {
          vpx_highbd_subtract_block(32, 32, residual, residual_stride,
                                    src, src_stride, dst, dst_stride, xd->bd);
          highbd_fdct32x32(x->use_lp32x32fdct, residual, coeffs,
                           residual_stride);
          vpx_highbd_quantize_b_32x32(coeffs, 1024, x->skip_block, p->zbin,
                                      p->round, p->quant, p->quant_shift,
                                      quantized, dequantized, pd->dequant,
                                      eob, order->scan, order->iscan);
        }
        if (reconstruct && *eob) {
          vp9_highbd_idct32x32_add(dequantized, dst, dst_stride, *eob,
                                   xd->bd);
        }
        break;
      case TX_16X16:
        if (recode) {
          vpx_highbd_subtract_block(16, 16, residual, residual_stride,
                                    src, src_stride, dst, dst_stride, xd->bd);
          if (tx_type != DCT_DCT) {
            vp9_highbd_fht16x16(residual, coeffs, residual_stride, tx_type);
          } else {
            vpx_highbd_fdct16x16(residual, coeffs, residual_stride);
          }
          vpx_highbd_quantize_b(coeffs, 256, x->skip_block, p->zbin,
                                p->round, p->quant, p->quant_shift,
                                quantized, dequantized, pd->dequant, eob,
                                order->scan, order->iscan);
        }
        if (reconstruct && *eob) {
          vp9_highbd_iht16x16_add(tx_type, dequantized, dst, dst_stride,
                                  *eob, xd->bd);
        }
        break;
      case TX_8X8:
        if (recode) {
          vpx_highbd_subtract_block(8, 8, residual, residual_stride,
                                    src, src_stride, dst, dst_stride, xd->bd);
          if (tx_type != DCT_DCT) {
            vp9_highbd_fht8x8(residual, coeffs, residual_stride, tx_type);
          } else {
            vpx_highbd_fdct8x8(residual, coeffs, residual_stride);
          }
          vpx_highbd_quantize_b(coeffs, 64, x->skip_block, p->zbin,
                                p->round, p->quant, p->quant_shift,
                                quantized, dequantized, pd->dequant, eob,
                                order->scan, order->iscan);
        }
        if (reconstruct && *eob) {
          vp9_highbd_iht8x8_add(tx_type, dequantized, dst, dst_stride, *eob,
                                xd->bd);
        }
        break;
      case TX_4X4:
        if (recode) {
          vpx_highbd_subtract_block(4, 4, residual, residual_stride,
                                    src, src_stride, dst, dst_stride, xd->bd);
          if (tx_type == DCT_DCT) {
            x->fwd_txm4x4(residual, coeffs, residual_stride);
          } else {
            vp9_highbd_fht4x4(residual, coeffs, residual_stride, tx_type);
          }
          vpx_highbd_quantize_b(coeffs, 16, x->skip_block, p->zbin,
                                p->round, p->quant, p->quant_shift,
                                quantized, dequantized, pd->dequant, eob,
                                order->scan, order->iscan);
        }

        if (reconstruct && *eob) {
          if (tx_type != DCT_DCT) {
            vp9_highbd_iht4x4_16_add(dequantized, dst, dst_stride, tx_type,
                                     xd->bd);
          } else {
            // this is like vp9_short_idct4x4 but has a special case around
            // eob<=1 which is significant (not just an optimization) for the
            // lossless case.
            x->highbd_itxm_add(dequantized, dst, dst_stride, *eob, xd->bd);
          }
        }
        break;
      default:
        // Unknown transform size: leave the skip flag untouched.
        assert(0);
        return;
    }
    if (*eob != 0) {
      *(args->skip) = 0;
    }
    return;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH

  switch (tx_size) {
    case TX_32X32:
      if (recode) {
        vpx_subtract_block(32, 32, residual, residual_stride,
                           src, src_stride, dst, dst_stride);
        fdct32x32(x->use_lp32x32fdct, residual, coeffs, residual_stride);
        vpx_quantize_b_32x32(coeffs, 1024, x->skip_block, p->zbin, p->round,
                             p->quant, p->quant_shift, quantized,
                             dequantized, pd->dequant, eob, order->scan,
                             order->iscan);
      }
      if (reconstruct && *eob) {
        vp9_idct32x32_add(dequantized, dst, dst_stride, *eob);
      }
      break;
    case TX_16X16:
      if (recode) {
        vpx_subtract_block(16, 16, residual, residual_stride,
                           src, src_stride, dst, dst_stride);
        // Called for every tx_type in this path, DCT_DCT included.
        vp9_fht16x16(residual, coeffs, residual_stride, tx_type);
        vpx_quantize_b(coeffs, 256, x->skip_block, p->zbin, p->round,
                       p->quant, p->quant_shift, quantized, dequantized,
                       pd->dequant, eob, order->scan, order->iscan);
      }
      if (reconstruct && *eob) {
        vp9_iht16x16_add(tx_type, dequantized, dst, dst_stride, *eob);
      }
      break;
    case TX_8X8:
      if (recode) {
        vpx_subtract_block(8, 8, residual, residual_stride,
                           src, src_stride, dst, dst_stride);
        // Called for every tx_type in this path, DCT_DCT included.
        vp9_fht8x8(residual, coeffs, residual_stride, tx_type);
        vpx_quantize_b(coeffs, 64, x->skip_block, p->zbin, p->round,
                       p->quant, p->quant_shift, quantized, dequantized,
                       pd->dequant, eob, order->scan, order->iscan);
      }
      if (reconstruct && *eob) {
        vp9_iht8x8_add(tx_type, dequantized, dst, dst_stride, *eob);
      }
      break;
    case TX_4X4:
      if (recode) {
        vpx_subtract_block(4, 4, residual, residual_stride,
                           src, src_stride, dst, dst_stride);
        if (tx_type == DCT_DCT) {
          x->fwd_txm4x4(residual, coeffs, residual_stride);
        } else {
          vp9_fht4x4(residual, coeffs, residual_stride, tx_type);
        }
        vpx_quantize_b(coeffs, 16, x->skip_block, p->zbin, p->round,
                       p->quant, p->quant_shift, quantized, dequantized,
                       pd->dequant, eob, order->scan, order->iscan);
      }

      if (reconstruct && *eob) {
        if (tx_type != DCT_DCT) {
          vp9_iht4x4_16_add(dequantized, dst, dst_stride, tx_type);
        } else {
          // this is like vp9_short_idct4x4 but has a special case around
          // eob<=1 which is significant (not just an optimization) for the
          // lossless case.
          x->itxm_add(dequantized, dst, dst_stride, *eob);
        }
      }
      break;
    default:
      assert(0);
      break;
  }
  if (*eob != 0) {
    *(args->skip) = 0;
  }
}
968 
// Intra-encodes every transform block of one plane.
//
// Builds the callback arguments (macroblock, no optimize context, and a
// pointer to the current mode info's skip flag) and applies
// vp9_encode_block_intra to each transform block of `plane`.
//
//   x      macroblock being encoded
//   bsize  block size of the region to process
//   plane  index of the plane to encode
void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
  // The middle member is NULL: no optimize context is supplied on this path.
  struct encode_b_args intra_args = {x, NULL, &x->e_mbd.mi[0]->mbmi.skip};

  vp9_foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane,
                                         vp9_encode_block_intra,
                                         &intra_args);
}
976