1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12
13 #include "vpx_config.h"
14 #include "vp8_rtcd.h"
15 #include "encodemb.h"
16 #include "vp8/common/reconinter.h"
17 #include "vp8/encoder/quantize.h"
18 #include "tokenize.h"
19 #include "vp8/common/invtrans.h"
20 #include "vpx_mem/vpx_mem.h"
21 #include "rdopt.h"
22
/* Compute the 4x4 residual (source minus predictor) for one sub-block. */
void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
  unsigned char *src = *(be->base_src) + be->src;

  vpx_subtract_block(4, 4, be->src_diff, pitch, src, be->src_stride,
                     bd->predictor, pitch);
}
32
/* Compute the 8x8 chroma residuals for a macroblock.  The luma residual
 * occupies diff[0..255]; the U and V difference blocks follow at fixed
 * offsets 256 and 320, each stored with a stride of 8. */
void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
                       int src_stride, unsigned char *upred,
                       unsigned char *vpred, int pred_stride) {
  vpx_subtract_block(8, 8, diff + 256, 8, usrc, src_stride, upred,
                     pred_stride);
  vpx_subtract_block(8, 8, diff + 320, 8, vsrc, src_stride, vpred,
                     pred_stride);
}
42
/* Compute the 16x16 luma residual (source minus prediction) into diff,
 * which is laid out with a stride of 16. */
void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
                      unsigned char *pred, int pred_stride) {
  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
}
47
/* Compute the full macroblock residual: luma against the reconstructed
 * destination buffer, then both chroma planes. */
static void vp8_subtract_mb(MACROBLOCK *x) {
  BLOCK *const y_block = &x->block[0];

  vp8_subtract_mby(x->src_diff, *(y_block->base_src), y_block->src_stride,
                   x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

  vp8_subtract_mbuv(x->src_diff, x->src.u_buffer, x->src.v_buffer,
                    x->src.uv_stride, x->e_mbd.dst.u_buffer,
                    x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride);
}
57
/* Gather the 16 luma DC coefficients (one per 4x4 block, each 16 entries
 * apart in x->coeff) into the second-order block's source-difference
 * buffer at offset 384. */
static void build_dcblock(MACROBLOCK *x) {
  short *const dc_dst = &x->src_diff[384];
  int j;

  for (j = 0; j < 16; ++j) dc_dst[j] = x->coeff[j * 16];
}
66
/* Forward DCT of the chroma sub-blocks (indices 16-23), transforming two
 * 4x4 blocks per call with a source pitch of 16. */
void vp8_transform_mbuv(MACROBLOCK *x) {
  int b;

  for (b = 16; b < 24; b += 2) {
    x->short_fdct8x4(x->block[b].src_diff, x->block[b].coeff, 16);
  }
}
74
/* Transform the luma plane of an intra macroblock: forward DCT of the 16
 * 4x4 sub-blocks, then the second-order Walsh-Hadamard transform of their
 * DC values. */
void vp8_transform_intra_mby(MACROBLOCK *x) {
  int b;

  /* Two 4x4 blocks per call; source pitch of a 16-wide luma diff is 32. */
  for (b = 0; b < 16; b += 2) {
    x->short_fdct8x4(x->block[b].src_diff, x->block[b].coeff, 32);
  }

  /* Collect the 16 luma DC values into block 24 ... */
  build_dcblock(x);

  /* ... and run the 2nd order transform on them. */
  x->short_walsh4x4(x->block[24].src_diff, x->block[24].coeff, 8);
}
88
/* Transform all planes of an inter macroblock.  SPLITMV codes each luma
 * DC within its own block, so the second-order (Y2) stage is skipped for
 * that mode. */
static void transform_mb(MACROBLOCK *x) {
  const int do_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
  int b;

  /* Luma: 16 sub-blocks, two 4x4 transforms per call, pitch 32. */
  for (b = 0; b < 16; b += 2) {
    x->short_fdct8x4(x->block[b].src_diff, x->block[b].coeff, 32);
  }

  /* Build the dc block from the 16 y dc values. */
  if (do_2nd_order) build_dcblock(x);

  /* Chroma: blocks 16-23, pitch 16. */
  for (b = 16; b < 24; b += 2) {
    x->short_fdct8x4(x->block[b].src_diff, x->block[b].coeff, 16);
  }

  /* Second order transform on the dc block. */
  if (do_2nd_order) {
    x->short_walsh4x4(x->block[24].src_diff, x->block[24].coeff, 8);
  }
}
108
/* Luma-only transform for a macroblock: forward DCT of the 16 sub-blocks
 * plus, for non-SPLITMV modes, the second-order transform of the DCs. */
static void transform_mby(MACROBLOCK *x) {
  int b;

  for (b = 0; b < 16; b += 2) {
    x->short_fdct8x4(x->block[b].src_diff, x->block[b].coeff, 32);
  }

  /* Non-SPLITMV modes carry a Y2 block: gather the 16 luma DC values and
   * apply the 4x4 Walsh-Hadamard transform to them. */
  if (x->e_mbd.mode_info_context->mbmi.mode != SPLITMV) {
    build_dcblock(x);
    x->short_walsh4x4(x->block[24].src_diff, x->block[24].coeff, 8);
  }
}
122
/* Tie-breaker for equal RD costs: keeps only the low byte of the scaled
 * rate term so ties are resolved deterministically.  DM and D are accepted
 * for call-site symmetry with RDCOST but are intentionally unused. */
#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)

typedef struct vp8_token_state vp8_token_state;

/* One node of the trellis built by optimize_b(); tokens[i][p] describes
 * path p (0 = keep the quantized value, 1 = magnitude reduced by one)
 * at coefficient position i. */
struct vp8_token_state {
  int rate;          /* accumulated token rate from this node to EOB */
  int error;         /* accumulated squared coefficient error */
  signed char next;  /* position of the next trellis node (16 = sentinel) */
  signed char token; /* DCT token emitted at this position */
  short qc;          /* quantized coefficient value chosen for this node */
};

/* TODO: experiments to find optimal multiple numbers */
#define Y1_RD_MULT 4
#define UV_RD_MULT 2
#define Y2_RD_MULT 16

/* Error multipliers indexed by plane type (Y no-DC, Y2, UV, Y with-DC). */
static const int plane_rd_mult[4] = { Y1_RD_MULT, Y2_RD_MULT, UV_RD_MULT,
                                      Y1_RD_MULT };
142
/* Trellis (Viterbi) optimization of one 4x4 block's quantized coefficients.
 * For every nonzero coefficient two candidates are tracked: keep the
 * quantized value, or reduce its magnitude by one step; the
 * rate/distortion-optimal path is then written back into qcoeff/dqcoeff,
 * possibly shortening the end-of-block.
 *
 * mb   - encoder macroblock (coefficients, token costs, rd multipliers)
 * ib   - block index within the macroblock (0-24)
 * type - PLANE_TYPE_* of the block; also determines whether position 0
 *        (DC) belongs to this block (i0 below)
 * a, l - above/left entropy contexts for this block, updated on return
 */
static void optimize_b(MACROBLOCK *mb, int ib, int type, ENTROPY_CONTEXT *a,
                       ENTROPY_CONTEXT *l) {
  BLOCK *b;
  BLOCKD *d;
  vp8_token_state tokens[17][2]; /* [coeff position][path], +1 sentinel row */
  unsigned best_mask[2];         /* per-position successor choice, per path */
  const short *dequant_ptr;
  const short *coeff_ptr;
  short *qcoeff_ptr;
  short *dqcoeff_ptr;
  int eob;
  int i0; /* first position owned by this plane (1 when DC is in Y2) */
  int rc;
  int x;
  int sz = 0;
  int next;
  int rdmult;
  int rddiv;
  int final_eob;
  int rd_cost0;
  int rd_cost1;
  int rate0;
  int rate1;
  int error0;
  int error1;
  int t0;
  int t1;
  int best;
  int band;
  int pt;
  int i;
  int err_mult = plane_rd_mult[type];

  b = &mb->block[ib];
  d = &mb->e_mbd.block[ib];

  dequant_ptr = d->dequant;
  coeff_ptr = b->coeff;
  qcoeff_ptr = d->qcoeff;
  dqcoeff_ptr = d->dqcoeff;
  i0 = !type;
  eob = *d->eob;

  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
  rdmult = mb->rdmult * err_mult;
  if (mb->e_mbd.mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
    rdmult = (rdmult * 9) >> 4; /* scale the multiplier down for intra */
  }

  rddiv = mb->rddiv;
  best_mask[0] = best_mask[1] = 0;
  /* Initialize the sentinel node of the trellis. */
  tokens[eob][0].rate = 0;
  tokens[eob][0].error = 0;
  tokens[eob][0].next = 16;
  tokens[eob][0].token = DCT_EOB_TOKEN;
  tokens[eob][0].qc = 0;
  *(tokens[eob] + 1) = *(tokens[eob] + 0);
  next = eob;
  /* Walk backwards from the last coefficient towards the start. */
  for (i = eob; i-- > i0;) {
    int base_bits;
    int d2;
    int dx;

    rc = vp8_default_zig_zag1d[i];
    x = qcoeff_ptr[rc];
    /* Only add a trellis state for non-zero coefficients. */
    if (x) {
      int shortcut = 0;
      error0 = tokens[next][0].error;
      error1 = tokens[next][1].error;
      /* Evaluate the first possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;
      t0 = (vp8_dct_value_tokens_ptr + x)->Token;
      /* Consider both possible successor states. */
      if (next < 16) {
        band = vp8_coef_bands[i + 1];
        pt = vp8_prev_token_class[t0];
        rate0 += mb->token_costs[type][band][pt][tokens[next][0].token];
        rate1 += mb->token_costs[type][band][pt][tokens[next][1].token];
      }
      rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
      rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
      if (rd_cost0 == rd_cost1) {
        /* Break exact cost ties deterministically on the rate term. */
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
      }
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = *(vp8_dct_value_cost_ptr + x);
      dx = dqcoeff_ptr[rc] - coeff_ptr[rc];
      d2 = dx * dx;
      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][0].error = d2 + (best ? error1 : error0);
      tokens[i][0].next = next;
      tokens[i][0].token = t0;
      tokens[i][0].qc = x;
      best_mask[0] |= best << i;
      /* Evaluate the second possibility for this state. */
      rate0 = tokens[next][0].rate;
      rate1 = tokens[next][1].rate;

      /* A one-step magnitude reduction is only considered when the
       * dequantized value overshoots the original coefficient by less
       * than one quantizer step. */
      if ((abs(x) * dequant_ptr[rc] > abs(coeff_ptr[rc])) &&
          (abs(x) * dequant_ptr[rc] < abs(coeff_ptr[rc]) + dequant_ptr[rc])) {
        shortcut = 1;
      } else {
        shortcut = 0;
      }

      if (shortcut) {
        sz = -(x < 0);   /* 0 for positive x, -1 for negative */
        x -= 2 * sz + 1; /* move the magnitude one step toward zero */
      }

      /* Consider both possible successor states. */
      if (!x) {
        /* If we reduced this coefficient to zero, check to see if
         * we need to move the EOB back here.
         */
        t0 =
            tokens[next][0].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN;
        t1 =
            tokens[next][1].token == DCT_EOB_TOKEN ? DCT_EOB_TOKEN : ZERO_TOKEN;
      } else {
        t0 = t1 = (vp8_dct_value_tokens_ptr + x)->Token;
      }
      if (next < 16) {
        band = vp8_coef_bands[i + 1];
        if (t0 != DCT_EOB_TOKEN) {
          pt = vp8_prev_token_class[t0];
          rate0 += mb->token_costs[type][band][pt][tokens[next][0].token];
        }
        if (t1 != DCT_EOB_TOKEN) {
          pt = vp8_prev_token_class[t1];
          rate1 += mb->token_costs[type][band][pt][tokens[next][1].token];
        }
      }

      rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
      rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
      if (rd_cost0 == rd_cost1) {
        rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
        rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
      }
      /* And pick the best. */
      best = rd_cost1 < rd_cost0;
      base_bits = *(vp8_dct_value_cost_ptr + x);

      if (shortcut) {
        /* Incremental distortion update for the reduced magnitude;
         * (dq + sz) ^ sz is +dq or -dq matching the coefficient sign. */
        dx -= (dequant_ptr[rc] + sz) ^ sz;
        d2 = dx * dx;
      }
      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
      tokens[i][1].error = d2 + (best ? error1 : error0);
      tokens[i][1].next = next;
      tokens[i][1].token = best ? t1 : t0;
      tokens[i][1].qc = x;
      best_mask[1] |= best << i;
      /* Finally, make this the new head of the trellis. */
      next = i;
    }
    /* There's no choice to make for a zero coefficient, so we don't
     * add a new trellis node, but we do need to update the costs.
     */
    else {
      band = vp8_coef_bands[i + 1];
      t0 = tokens[next][0].token;
      t1 = tokens[next][1].token;
      /* Update the cost of each path if we're past the EOB token. */
      if (t0 != DCT_EOB_TOKEN) {
        tokens[next][0].rate += mb->token_costs[type][band][0][t0];
        tokens[next][0].token = ZERO_TOKEN;
      }
      if (t1 != DCT_EOB_TOKEN) {
        tokens[next][1].rate += mb->token_costs[type][band][0][t1];
        tokens[next][1].token = ZERO_TOKEN;
      }
      /* Don't update next, because we didn't add a new node. */
    }
  }

  /* Now pick the best path through the whole trellis. */
  band = vp8_coef_bands[i + 1];
  VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
  rate0 = tokens[next][0].rate;
  rate1 = tokens[next][1].rate;
  error0 = tokens[next][0].error;
  error1 = tokens[next][1].error;
  t0 = tokens[next][0].token;
  t1 = tokens[next][1].token;
  rate0 += mb->token_costs[type][band][pt][t0];
  rate1 += mb->token_costs[type][band][pt][t1];
  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);
  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);
  if (rd_cost0 == rd_cost1) {
    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);
    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);
  }
  best = rd_cost1 < rd_cost0;
  final_eob = i0 - 1;
  /* Walk the winning path forward, writing back the chosen coefficients
   * and tracking the last nonzero position. */
  for (i = next; i < eob; i = next) {
    x = tokens[i][best].qc;
    if (x) final_eob = i;
    rc = vp8_default_zig_zag1d[i];
    qcoeff_ptr[rc] = x;
    dqcoeff_ptr[rc] = x * dequant_ptr[rc];
    next = tokens[i][best].next;
    best = (best_mask[best] >> i) & 1;
  }
  final_eob++;

  /* Context is "nonzero" iff any coefficient survived past position i0. */
  *a = *l = (final_eob != !type);
  *d->eob = (char)final_eob;
}
/* For the second-order (Y2) block at small quantizer values, zero out the
 * whole block when its dequantized coefficients are collectively so small
 * that the inverse WHT followed by the DC-only IDCT would reconstruct to
 * all zeros anyway.  Updates the entropy contexts when the block is reset.
 */
static void check_reset_2nd_coeffs(MACROBLOCKD *x, int type, ENTROPY_CONTEXT *a,
                                   ENTROPY_CONTEXT *l) {
  BLOCKD *const bd = &x->block[24];
  int abs_total = 0;
  int i;

  /* With both dequant steps >= 35, any nonzero coefficient already
   * contributes at least 35 to the sum, so the reset can never apply. */
  if (bd->dequant[0] >= 35 && bd->dequant[1] >= 35) return;

  /* Accumulate |dqcoeff| over the coded coefficients, bailing out as soon
   * as the threshold is reached. */
  for (i = 0; i < (*bd->eob); ++i) {
    const int c = bd->dqcoeff[vp8_default_zig_zag1d[i]];
    abs_total += (c < 0) ? -c : c;
    if (abs_total >= 35) return;
  }
  /**************************************************************************
  our inverse hadamard transform effectively is weighted sum of all 16 inputs
  with weight either 1 or -1. It has a last stage scaling of (sum+3)>>3. And
  dc only idct is (dc+4)>>3. So if all the sums are between -35 and 29, the
  output after inverse wht and idct will be all zero. A sum of absolute value
  smaller than 35 guarantees all 16 different (+1/-1) weighted sums in wht
  fall between -35 and +35.
  **************************************************************************/
  if (abs_total < 35) {
    for (i = 0; i < (*bd->eob); ++i) {
      const int zz = vp8_default_zig_zag1d[i];
      bd->qcoeff[zz] = 0;
      bd->dqcoeff[zz] = 0;
    }
    *bd->eob = 0;
    *a = *l = (*bd->eob != !type);
  }
}
389
/* Run trellis coefficient optimization over every block of a macroblock:
 * 16 luma blocks, 8 chroma blocks, and (when present) the Y2 block. */
static void optimize_mb(MACROBLOCK *x) {
  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
  ENTROPY_CONTEXT *a;
  ENTROPY_CONTEXT *l;
  int has_2nd_order;
  int y_type;
  int b;

  /* Work on local copies of the entropy contexts. */
  memcpy(&above_copy, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
  memcpy(&left_copy, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

  a = (ENTROPY_CONTEXT *)&above_copy;
  l = (ENTROPY_CONTEXT *)&left_copy;

  /* B_PRED and SPLITMV code the luma DC inside each block; everything
   * else uses the separate second-order block. */
  has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
                   x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
  y_type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;

  for (b = 0; b < 16; ++b) {
    optimize_b(x, b, y_type, a + vp8_block2above[b], l + vp8_block2left[b]);
  }

  for (b = 16; b < 24; ++b) {
    optimize_b(x, b, PLANE_TYPE_UV, a + vp8_block2above[b],
               l + vp8_block2left[b]);
  }

  if (has_2nd_order) {
    optimize_b(x, 24, PLANE_TYPE_Y2, a + vp8_block2above[24],
               l + vp8_block2left[24]);
    check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, a + vp8_block2above[24],
                           l + vp8_block2left[24]);
  }
}
426
/* Trellis coefficient optimization of the luma plane only (plus the Y2
 * block for modes that carry one).  No-op when the neighbor entropy
 * contexts are not available. */
void vp8_optimize_mby(MACROBLOCK *x) {
  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
  ENTROPY_CONTEXT *a;
  ENTROPY_CONTEXT *l;
  int has_2nd_order;
  int y_type;
  int b;

  if (!x->e_mbd.above_context) return;
  if (!x->e_mbd.left_context) return;

  /* Work on local copies of the entropy contexts. */
  memcpy(&above_copy, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
  memcpy(&left_copy, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

  a = (ENTROPY_CONTEXT *)&above_copy;
  l = (ENTROPY_CONTEXT *)&left_copy;

  has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED &&
                   x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
  y_type = has_2nd_order ? PLANE_TYPE_Y_NO_DC : PLANE_TYPE_Y_WITH_DC;

  for (b = 0; b < 16; ++b) {
    optimize_b(x, b, y_type, a + vp8_block2above[b], l + vp8_block2left[b]);
  }

  if (has_2nd_order) {
    optimize_b(x, 24, PLANE_TYPE_Y2, a + vp8_block2above[24],
               l + vp8_block2left[24]);
    check_reset_2nd_coeffs(&x->e_mbd, PLANE_TYPE_Y2, a + vp8_block2above[24],
                           l + vp8_block2left[24]);
  }
}
462
/* Trellis coefficient optimization of the chroma blocks (16-23) only.
 * No-op when the neighbor entropy contexts are not available. */
void vp8_optimize_mbuv(MACROBLOCK *x) {
  ENTROPY_CONTEXT_PLANES above_copy, left_copy;
  ENTROPY_CONTEXT *a;
  ENTROPY_CONTEXT *l;
  int b;

  if (!x->e_mbd.above_context) return;
  if (!x->e_mbd.left_context) return;

  /* Work on local copies of the entropy contexts. */
  memcpy(&above_copy, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES));
  memcpy(&left_copy, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES));

  a = (ENTROPY_CONTEXT *)&above_copy;
  l = (ENTROPY_CONTEXT *)&left_copy;

  for (b = 16; b < 24; ++b) {
    optimize_b(x, b, PLANE_TYPE_UV, a + vp8_block2above[b],
               l + vp8_block2left[b]);
  }
}
484
/* Encode an inter macroblock: predict, subtract, transform, quantize,
 * and optionally trellis-optimize the coefficients. */
void vp8_encode_inter16x16(MACROBLOCK *x) {
  vp8_build_inter_predictors_mb(&x->e_mbd);

  vp8_subtract_mb(x);
  transform_mb(x);
  vp8_quantize_mb(x);

  if (x->optimize) {
    optimize_mb(x);
  }
}
496
/* this function is used by first pass only */
/* Luma-only inter encode: predict, subtract, transform, quantize, then
 * inverse-transform to reconstruct the luma plane in place. */
void vp8_encode_inter16x16y(MACROBLOCK *x) {
  BLOCK *const y_block = &x->block[0];

  vp8_build_inter16x16_predictors_mby(&x->e_mbd, x->e_mbd.dst.y_buffer,
                                      x->e_mbd.dst.y_stride);

  vp8_subtract_mby(x->src_diff, *(y_block->base_src), y_block->src_stride,
                   x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride);

  transform_mby(x);
  vp8_quantize_mby(x);

  vp8_inverse_transform_mby(&x->e_mbd);
}
513