// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64    // neutral value for susceptibility
#define MIN_ALPHA 30    // lowest usable value for susceptibility
#define MAX_ALPHA 100   // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9   // Scaling constant between the sns value and the QP
                        // power-law modulation. Must be strictly less than 1.
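
// For intuition (illustrative numbers only): with sns_strength = 50,
// VP8SetSegmentParams() below uses
//   amp = SNS_TO_DQ * 50 / 100. / 128. ~= 3.5e-3,
// so a segment with susceptibility alpha_ = 64 gets a compression exponent
// expn = 1. - amp * 64 ~= 0.775 instead of the neutral 1.0.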

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0   // I16 mode (special case)
#define FLATNESS_LIMIT_I4  3   // I4 mode
#define FLATNESS_LIMIT_UV  2   // UV mode
#define FLATNESS_PENALTY   140 // roughly 1 bit per block

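// Fixed-point multiply in Q8: computes (a * b) / 256, rounded to nearest
// (the '+ 128' term). Used below to scale the spectral distortion 'SD'.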
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif  // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

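// kZigzag[n] is the raster-order index (within a 4x4 block) of the n-th
// coefficient in zigzag scan order.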
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};
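
// Note on the bias values above: in QUANTDIV()'s rounding, a bias of 128 (in
// Q8) is plain round-to-nearest (compare the 'neutral bias' BIAS(0x80) used
// in TrellisQuantizeBlock() below), so values below 128 round down more
// often, i.e. they nudge coefficients toward the cheaper, smaller level.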

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
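    // E.g. (illustrative, assuming QUANTDIV() is the usual
    // '(coeff * iQ + B) >> QFIX' rounding division): with q_[i] = 40 and a
    // neutral bias, the cut-off sits near 20, so a coefficient of 19
    // quantizes to 0 while 21 quantizes to 1; the bias_[i] set above shifts
    // this threshold.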
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4 = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_ = (3 * q_i16 * q_i16);
    m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
    m->tlambda_ = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];  // quantization-aware min disto
    m->max_edge_ = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
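
// Example (numbers for intuition only): the default quality = 75 maps to
// c = 0.75, hence linear_c = 2. * 0.75 - 1. = 0.5 and a compression factor
// of pow(0.5, 1 / 3.) ~= 0.794.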

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
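
// E.g. (illustrative): at the midpoint alpha = 0.575, the interpolation
// gives expn = 0.9 + (0.4 - 0.9) / (0.85 - 0.30) * (0.575 - 0.30) = 0.65,
// so a quality setting c = 0.75 maps to pow(0.75, 0.65) ~= 0.83.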

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {  // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
           / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are noticeably more reactive to high quantizers (flat
  // DC-blocks tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);  // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;  // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);  // initialize segments' filtering parameters

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);  // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,   // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS  // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12, 8,
  11, 10, 8, 6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;  // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;  // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0  // how much lower level to try
#define MAX_DELTA 1  // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
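
// With the current MIN_DELTA = 0 and MAX_DELTA = 1, each trellis stage keeps
// NUM_NODES = 2 candidate levels per coefficient: the neutrally-quantized
// level0 and level0 + 1 (when it survives the thresh_level cut below).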

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}
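
// Illustrative magnitudes only: with lambda = 100, a rate of R + H = 50 cost
// units and a distortion of D + SD = 30 SSE units, the score is
// 50 * 100 + 256 * 30 = 12680; saving one SSE unit of distortion is worth
// RD_DISTO_MULT / lambda = 2.56 rate units at this lambda.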

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};  // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect all 16 coeffs: going up to last + 1
    // (inclusive) is enough, without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);  // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {  // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;  // default, in case

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;          // best eob position
          best_path[1] = m;          // best node index
          best_path[2] = best_prev;  // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];  // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
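  // Layout of the 'nz' bitfield, as used throughout this file: bits 0..15
  // flag the sixteen 4x4 luma AC blocks, bits 16..23 the chroma blocks
  // (see ReconstructUV(), which shifts its result by 16), and bit 24 the
  // luma DC (WHT) block set just above.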

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}


static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7   // fraction of error sent to the 4x4 block below
#define C2 8   // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1  // storage descaling, needed to make the error fit int8_t
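
// In other words (illustrative): a DC quantization error 'e' is split as
// C1 / (1 << DSHIFT) = 7/16 of e to the block below and C2 / 16 = 8/16 of e
// to the block on the right, i.e. 15/16 of e is propagated and 1/16 dropped.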

// Quantize as usual, but also compute and return the quantization error.
// The returned error is descaled by DSCALE (so that it fits the int8_t
// storage used for diffusion).
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}
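
// E.g. (numbers for illustration only): with q_[0] = 40 and *v = 100, the
// value snaps to a multiple of 40 (say qV = 80 after the biased rounding);
// *v becomes 80 and the returned error is (100 - 80) >> DSCALE = 10.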

static void CorrectDCValues(const VP8EncIterator* const it,
                            const VP8Matrix* const mtx,
                            int16_t tmp[][16], VP8ModeScore* const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->  err0 err1
  // left[1] | tmp[2]   tmp[3]       err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* const it,
                                 const VP8ModeScore* const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];           // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;  // ... 3/4th of err3
    top[0] = rd->derr[ch][1];            // ... err2
    top[1] = rd->derr[ch][2] - left[1];  // ... 1/4th of err3.
  }
}

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost (see SetRDScore() for the formula).

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);  // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
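  // The mask below checks that bit 24 (the luma DC/WHT block) is the only
  // non-zero flag among bits 0..15 (luma AC) and bit 24; the chroma bits
  // 16..23 are left out of the test.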
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}


static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;  // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;  // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;  // select intra4x4 over intra16x16
}


//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;  // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {  // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}


//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}


// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximately the right order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {  // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {  // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}