// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "./vp8enci.h"
#include "./cost.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64    // neutral value for susceptibility
#define MIN_ALPHA 30    // lowest usable value for susceptibility
#define MAX_ALPHA 100   // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9   // Scaling constant between the sns value and the QP
                        // power-law modulation. Must be strictly less than 1.

#define I4_PENALTY 14000  // Rate-penalty for quick i4/i16 decision

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10    // I16 mode
#define FLATNESS_LIMIT_I4  3     // I4 mode
#define FLATNESS_LIMIT_UV  2     // UV mode
#define FLATNESS_PENALTY   140   // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
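// Note: MULT_8B(a, b) is the rounded fixed-point product (a * b) / 256; it is
// used below to apply the 8-bit-scaled 'tlambda' weight to the TDisto term.
// E.g. MULT_8B(64, 1000) == (64000 + 128) >> 8 == 250.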

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
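// (Illustration: kZigzag maps scan order to raster position, i.e. the 4x4
// block is visited in this order:
//    0  1  5  6
//    2  4  7 12
//    3  8 11 13
//    9 10 14 15 )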

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
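  // Illustrative check (assuming QFIX == 17 and BIAS(b) == (b) << (QFIX - 8),
  // as defined in vp8enci.h): with q_[i] == 16 and bias == 110, we get
  // iq_ == 8192, bias_ == 56320 and zthresh_ == (131071 - 56320) / 8192 == 9.
  // Indeed (9 * 8192 + 56320) >> 17 == 0 while (10 * 8192 + 56320) >> 17 == 1.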
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {   // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
    m->lambda_mode_        = (1 * q4 * q4) >> 7;
    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv * quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;

    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;
  }
}
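
// Note: the lambdas above all scale roughly as q^2, matching the usual
// rate-distortion assumption that distortion grows quadratically with the
// quantization step; the per-mode shifts only set their relative weights.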

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map the two accordingly using a piece-wise linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
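
// For instance, a user quality of 75 gives c = 0.75, hence
// linear_c = 2. * 0.75 - 1. = 0.5 and v = 0.5^(1/3) ~= 0.794. Fed as c_base
// into VP8SetSegmentParams() below with a neutral segment (expn ~= 1.), this
// yields quant_ ~= 127. * (1. - 0.794) ~= 26.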

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {   // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
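
// Example: if segments {0, 1, 2, 3} come out with quant_/fstrength_ pairs
// {(26, 20), (26, 20), (30, 22), (26, 20)}, segments 1 and 3 are remapped to
// segment 0, segment 2 becomes segment 1, and num_segments_ drops to 2.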

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize the segments' filtering

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);   // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const int VP8Scan[16] = {  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const int VP8ScanUV[4 + 4] = {
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,    // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS   // V
};
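
// Both tables are byte offsets (with a stride of BPS samples per row) into
// the packed Y/U/V cache pictured in the Layout diagram above.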

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12, 8,
  11, 10, 8, 6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;   // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;  // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
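
// The trellis below is a Viterbi-style dynamic program: one column of
// NUM_NODES candidate levels per coefficient position, each node keeping the
// best-scoring link to the previous column. On success,
// TrellisQuantizeBlock() returns 1 (some non-zero level survives), fills
// 'out' with the zigzag-ordered levels, and rewrites 'in' with the
// dequantized coefficients so the caller can inverse-transform directly.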

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
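
// In both formulas the score implements J = R * lambda + RD_DISTO_MULT * D;
// comparing two such scores is equivalent to comparing
// D + (lambda / RD_DISTO_MULT) * R, but keeps everything integer-only.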

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect all the way up to n = 16 coeffs; going up
    // to last + 1 (inclusive) loses very little.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);   // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score, last_pos_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case.

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }

      // Compute extra rate cost if last coeff's position is < 15
      {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from the maximal error (the all-zero case) as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;           // best eob position
          best_path[1] = m;           // best node index
          best_path[2] = best_prev;   // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on the terminal node is not necessarily equal to the
    // best_prev for non-terminal nodes. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate its distortion and bit-cost,
// and pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the maximal
  // delta between the sub-4x4 blocks.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[4]);
  const int v2 = abs(DCs[5]);
  int max_v = (v0 > v1) ? v0 : v1;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}
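
// Note: y_dc_levels[] appears to be stored in zigzag order (see kZigzag), so
// the indices probed above land on low-frequency AC terms of the DC-WHT,
// i.e. the coarse luma variation between the 4x4 sub-blocks.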

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

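// Returns 1 if the cumulative count of non-zero AC levels over 'num_blocks'
// consecutive 16-coeff blocks stays at or below 'thresh', and 0 as soon as
// it exceeds it; used to detect "flat" blocks that deserve a rate penalty
// when coded with a complex prediction mode.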
static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t score = 0;
  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    int i;
    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
      score += (levels[i] != 0);
      if (score > thresh) return 0;
    }
    levels += 16;
  }
  return 1;
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;   // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat areas being mispredicted by a complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;   // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;   // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;   // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  score_t score_i4 = (score_t)I4_PENALTY;
  int16_t tmp_levels[16][16];
  uint8_t modes_i4[16];
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {   // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, tmp_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, modes_i4);
    memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref);
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}