// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64   // neutral value for susceptibility
#define MIN_ALPHA 30   // lowest usable value for susceptibility
#define MAX_ALPHA 100  // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9  // Scaling constant between the sns value and the QP
                       // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0   // I16 mode (special case)
#define FLATNESS_LIMIT_I4  3   // I4 mode
#define FLATNESS_LIMIT_UV  2   // UV mode
#define FLATNESS_PENALTY   140 // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
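// i.e. a fixed-point multiply by b/256, rounded to nearest;
// e.g. MULT_8B(30, 100) == (3000 + 128) >> 8 == 12.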

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

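// Zigzag scan order: kZigzag[n] is the raster index, within a 4x4 block, of
// the n-th coefficient visited. For instance kZigzag[2] == 4, i.e. the third
// coefficient scanned is the one at column 0, row 1.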
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
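// The resulting boost stays small: for the highest-frequency coefficients
// (weight 90), sharpen_ is about q * 90 / 2048, i.e. roughly 4 for q == 100
// (see ExpandMatrix() below; it is only applied to AC luma).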

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
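    // For instance, assuming QFIX == 17 (see vp8i_enc.h) and the lowest AC
    // quantizer q_[1] == 4 with an AC bias of 110: iq_ == 32768 and
    // bias_ == 110 << 9 == 56320, so zthresh_ == (131071 - 56320) / 32768
    // == 2: a coeff of 2 still quantizes to 0, while a coeff of 3 gives 1.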
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {   // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a piecewise-linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
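// For example, quality 50 gives c == 0.5, hence linear_c == 1/3 and
// v == (1/3)^(1/3) ~= 0.69; quality 75 gives linear_c == 0.5 and v ~= 0.79.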

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {   // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;   // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);   // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

// Form all the ten Intra4x4 predictions in the yuv_p_ cache
// for the 4x4 block it->i4_
static void MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS,  8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS,  8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS,  8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS,  8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,   // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
};

//------------------------------------------------------------------------------
// Distortion measurement

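// Weighting of the DCT sub-bands, as used by the texture distortion (TDisto)
// metric below: lower frequencies weigh more than higher ones.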
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;   // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;   // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;     // best previous node
  int8_t sign;     // sign of coeff_i
  int16_t level;   // level
} Node;

// Score state
typedef struct {
  score_t score;           // partial RD score
  const uint16_t* costs;   // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
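// With the default MIN_DELTA == 0 and MAX_DELTA == 1 above, NUM_NODES == 2:
// for each coefficient the trellis explores the levels {level0, level0 + 1}.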

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
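// TYPE_I16_AC and TYPE_I4_AC blocks are quantized with the y1_ matrix and
// TYPE_CHROMA_A blocks with the uv_ matrix (see the TrellisQuantizeBlock()
// call sites below). TYPE_I16_AC also skips the DC coefficient ('first == 1'
// in TrellisQuantizeBlock()), since it is coded separately via the WHT.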

static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);   // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // Since base_score would be added to every candidate score, it is folded
      // in only once, after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;           // best eob position
          best_path[1] = m;           // best node index
          best_path[2] = best_prev;   // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7    // fraction of error sent to the 4x4 block below
#define C2 8    // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
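// In total, (C1 + C2) / (1 << DSHIFT) == 15/16th of the error is propagated,
// matching the deliberate under-correction mentioned above.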

// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->  err0 err1
  // left[1] | tmp[2]   tmp[3]       err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0]  = rd->derr[ch][1];            //     ... err2
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
  }
}

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;   // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;   // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;   // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;   // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;   // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {   // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;   // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {   // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}