1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 //   Quantization
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13 
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h>  // for abs()
17 
18 #include "src/dsp/quant.h"
19 #include "src/enc/vp8i_enc.h"
20 #include "src/enc/cost_enc.h"
21 
#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // higher meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1bit per block

// multiply 'a' by an 8-bit fixed-point factor 'b', with rounding
#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK
46 
47 //------------------------------------------------------------------------------
48 
49 #if defined(DEBUG_BLOCK)
50 
51 #include <stdio.h>
52 #include <stdlib.h>
53 
// Debug helper (compiled only when DEBUG_BLOCK is defined): dumps the
// source and reconstructed samples of the current macroblock, their
// per-pixel absolute differences, and the modes/levels stored in 'rd'.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma block: input samples, output samples, absolute delta.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // U and V are stored side by side in the cache: U in columns [0..7],
  // V in columns [8..15].
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Rate-distortion summary fields.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  // For I16 blocks, coeff #0 (DC) is coded separately and skipped here.
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
121 
122 #endif   // DEBUG_BLOCK
123 
124 //------------------------------------------------------------------------------
125 
clip(int v,int m,int M)126 static WEBP_INLINE int clip(int v, int m, int M) {
127   return v < m ? m : v > M ? M : v;
128 }
129 
// Zigzag scan order: maps coefficient index in scan order to its raster
// position within the 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer steps, indexed by the clipped quantizer index in [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer steps, indexed by the clipped quantizer index in [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// Alternate AC steps used for the luma-DC ('Y2') matrix in SetupMatrices().
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Quantization rounding biases (in 1/256th units), fed to BIAS() in
// ExpandMatrix().
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
204 
205 //------------------------------------------------------------------------------
206 // Initialize quantization parameters in VP8Matrix
207 
208 // Returns the average quantizer
// Fill in the derived fields (iq_, bias_, zthresh_, sharpen_) of 'm' from
// its q_[0]/q_[1] steps, replicating the AC parameters over coeffs 2..15.
// 'type' selects the bias row (0: luma-ac, 1: luma-dc, 2: chroma).
// Returns the rounded average quantizer step.
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int idx;
  int total = 0;
  // DC (idx == 0) and AC (idx == 1) entries each get their own bias and
  // zero-threshold.
  for (idx = 0; idx <= 1; ++idx) {
    const int bias_value = kBiasMatrices[type][idx > 0];
    m->iq_[idx] = (1 << QFIX) / m->q_[idx];
    m->bias_[idx] = BIAS(bias_value);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[idx] = ((1 << QFIX) - 1 - m->bias_[idx]) / m->iq_[idx];
  }
  // All the remaining coefficients reuse the AC parameters.
  for (idx = 2; idx < 16; ++idx) {
    m->q_[idx]       = m->q_[1];
    m->iq_[idx]      = m->iq_[1];
    m->bias_[idx]    = m->bias_[1];
    m->zthresh_[idx] = m->zthresh_[1];
  }
  for (idx = 0; idx < 16; ++idx) {
    // Sharpening is only applied to AC luma coeffs (type == 0).
    m->sharpen_[idx] =
        (type == 0) ? (kFreqSharpening[idx] * m->q_[idx]) >> SHARPEN_BITS : 0;
    total += m->q_[idx];
  }
  return (total + 8) >> 4;   // rounded average of the 16 steps
}
237 
// Clamp a lambda constant to be at least 1 (0 would disable the rate term).
static void CheckLambdaValue(int* const v) {
  if (*v < 1) {
    *v = 1;
  }
}
239 
// Build the quantization matrices and the rate-distortion lambda constants
// for every active segment, from the segments' 'quant_' and the encoder's
// per-plane quantizer deltas.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // Texture-distortion weighting is only enabled for methods >= 4.
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // NOTE(review): the upper clip bound here is 117 (not 127 as elsewhere),
    // which caps the UV DC quantizer step -- presumably intentional; confirm
    // before changing.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    // Lambda constants for the various mode decisions; they all scale
    // roughly with the square of the average quantizer step.
    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
288 
289 //------------------------------------------------------------------------------
290 // Initialize filtering parameters
291 
292 // Very small filter-strength values have close to no visual effect. So we can
293 // save a little decoding-CPU by turning filtering off for these.
294 #define FSTRENGTH_CUTOFF 2
295 
// Derive each segment's loop-filter strength from its quantizer and the
// user's filter settings, then seed the global filter header fields.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    // Clamp to [0..63]; strengths below FSTRENGTH_CUTOFF have close to no
    // visual effect, so filtering is turned off to save decoding CPU.
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
315 
316 //------------------------------------------------------------------------------
317 
318 // Note: if you change the values below, remember that the max range
319 // allowed by the syntax for DQ_UV is [-16,16].
320 #define MAX_DQ_UV (6)
321 #define MIN_DQ_UV (-4)
322 
323 // We want to emulate jpeg-like behaviour where the expected "good" quality
324 // is around q=75. Internally, our "good" middle is around c=50. So we
325 // map accordingly using linear piece-wise function
QualityToCompression(double c)326 static double QualityToCompression(double c) {
327   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328   // The file size roughly scales as pow(quantizer, 3.). Actually, the
329   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330   // in the mid-quant range. So we scale the compressibility inversely to
331   // this power-law: quant ~= compression ^ 1/3. This law holds well for
332   // low quant. Finer modeling for high-quant would make use of kAcTable[]
333   // more explicitly.
334   const double v = pow(linear_c, 1 / 3.);
335   return v;
336 }
337 
QualityToJPEGCompression(double c,double alpha)338 static double QualityToJPEGCompression(double c, double alpha) {
339   // We map the complexity 'alpha' and quality setting 'c' to a compression
340   // exponent empirically matched to the compression curve of libjpeg6b.
341   // On average, the WebP output size will be roughly similar to that of a
342   // JPEG file compressed with same quality factor.
343   const double amin = 0.30;
344   const double amax = 0.85;
345   const double exp_min = 0.4;
346   const double exp_max = 0.9;
347   const double slope = (exp_min - exp_max) / (amax - amin);
348   // Linearly interpolate 'expn' from exp_min to exp_max
349   // in the [amin, amax] range.
350   const double expn = (alpha > amax) ? exp_min
351                     : (alpha < amin) ? exp_max
352                     : exp_max + slope * (alpha - amin);
353   const double v = pow(c, expn);
354   return v;
355 }
356 
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)357 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358                                  const VP8SegmentInfo* const S2) {
359   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360 }
361 
// Merge segments that ended up with identical parameters, and remap the
// per-macroblock segment ids accordingly.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // After the loop 's2' is either the index of the matching segment, or
    // the slot the (distinct) segment is about to occupy.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
401 
// Set up the per-segment quantizers from the target 'quality' (in [0..100])
// and the segments' susceptibility, then derive the UV quantizer deltas,
// the filter strengths and the final quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);   // guaranteed since SNS_TO_DQ < 1 (see its comment)
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
456 
457 //------------------------------------------------------------------------------
458 // Form the predictions in cache
459 
// Offsets into the 'yuv_p_' prediction cache where each mode's samples live.
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
static const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
468 
VP8MakeLuma16Preds(const VP8EncIterator * const it)469 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472   VP8EncPredLuma16(it->yuv_p_, left, top);
473 }
474 
VP8MakeChroma8Preds(const VP8EncIterator * const it)475 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478   VP8EncPredChroma8(it->yuv_p_, left, top);
479 }
480 
// Form all the ten Intra4x4 predictions in the yuv_p_ cache
// for the 4x4 block it->i4_
static void MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
486 
487 //------------------------------------------------------------------------------
488 // Quantize
489 
490 // Layout:
491 // +----+----+
492 // |YYYY|UUVV| 0
493 // |YYYY|UUVV| 4
494 // |YYYY|....| 8
495 // |YYYY|....| 12
496 // +----+----+
497 
// Offsets of the sixteen 4x4 luma sub-blocks within the cache (see the
// layout diagram above).
const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Offsets of the eight 4x4 chroma sub-blocks: U first, then V.
static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
509 
510 //------------------------------------------------------------------------------
511 // Distortion measurement
512 
// Luma coefficient weights -- presumably used by the distortion metric
// further down the file (not referenced in this chunk).
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient weights applied to the trellis distortion term; flat
// (all 16) when the texture-distortion metric (USE_TDISTO) is disabled.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
527 
528 // Init/Copy the common fields in score.
// Reset the common rate-distortion fields. The score starts at MAX_COST so
// that any real candidate compares better.
static void InitScore(VP8ModeScore* const rd) {
  rd->score = MAX_COST;
  rd->nz = 0;
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
}
537 
// Copy the common rate-distortion fields from 'src' into 'dst'.
// Note: 'nz' is copied verbatim here, unlike in AddScore() where the new
// bits are accumulated.
static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->nz    = src->nz;
  dst->score = src->score;
  dst->D     = src->D;
  dst->SD    = src->SD;
  dst->R     = src->R;
  dst->H     = src->H;
}
547 
// Accumulate 'src' into 'dst'. The 'nz' bits are OR'ed together while all
// the other fields are summed.
static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->nz    |= src->nz;
  dst->score += src->score;
  dst->D     += src->D;
  dst->SD    += src->SD;
  dst->R     += src->R;
  dst->H     += src->H;
}
557 
558 //------------------------------------------------------------------------------
559 // Performs trellis-optimized quantization.
560 
// Trellis node: one candidate quantization level for a given coefficient
// position, linked back to its best predecessor.
typedef struct {
  int8_t prev;            // best previous node (index in [-MIN_DELTA..MAX_DELTA])
  int8_t sign;            // sign of coeff_i
  int16_t level;          // quantized level (always >= 0, sign kept aside)
} Node;

// Score state: running cost for one candidate at the current position.
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
// Accessors re-centered on MIN_DELTA, so the level delta 'l' may be negative.
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
582 
SetRDScore(int lambda,VP8ModeScore * const rd)583 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
584   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
585 }
586 
RDScoreTrellis(int lambda,score_t rate,score_t distortion)587 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
588                                           score_t distortion) {
589   return rate * lambda + RD_DISTO_MULT * distortion;
590 }
591 
// Coefficient type, used to select the probability/cost tables.
// TYPE_I16_AC skips coefficient #0 (the DC is coded separately, cf. the
// 'first' index in TrellisQuantizeBlock()).
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
594 
// Trellis-quantize one 4x4 block of coefficients.
// 'in' holds the raw transform coefficients and is rewritten with the
// de-quantized reconstruction; 'out' receives the quantized levels in
// zigzag order. 'ctx0' is the entropy context from the neighbors,
// 'coeff_type' selects the probability/cost tables and 'lambda' weights
// rate against distortion. Returns non-zero if any level is non-zero.
static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  // For I16 AC blocks the DC coefficient is coded separately: start at 1.
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  // Only two columns of score states are kept: current and previous.
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    // thresh_level is the highest level we allow testing (quantized with a
    // 0.5 rounding bias); anything above it is considered a dead node.
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // The base_score is added to all scores so it is only added for the final
      // value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];   // store the de-quantized reconstruction
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
766 
767 #undef NODE
768 
769 //------------------------------------------------------------------------------
770 // Performs: difference, transform, quantize, back-transform, add
771 // all at once. Output is the reconstructed block in *yuv_out, and the
772 // quantized levels in *levels.
773 
// Reconstructs an intra-16x16 luma macroblock for prediction mode 'mode':
// computes the residual vs. the predicted samples, forward-transforms,
// quantizes (with optional trellis optimization), then inverse-transforms
// and adds the prediction back into 'yuv_out'. Quantized levels are stored
// in rd->y_dc_levels (WHT/DC) and rd->y_ac_levels (AC).
// Returns the non-zero map: bit n is set if 4x4 sub-block n has non-zero AC
// levels; bit 24 is set if the DC (WHT) block is non-zero.
static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform all 16 4x4 sub-blocks, two at a time.
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The 16 DC coefficients go through a separate Walsh-Hadamard transform.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);   // refresh top_nz_[] / left_nz_[] contexts
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is coded via the WHT block instead
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
825 
// Reconstructs one intra-4x4 luma sub-block for prediction mode 'mode':
// forward-transform of the residual, quantization (trellis-optimized when
// enabled), then inverse transform + prediction add into 'yuv_out'.
// Quantized coefficients are written to 'levels'. Returns non-zero iff the
// quantized block has at least one non-zero coefficient.
static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int use_trellis = (DO_TRELLIS_I4 && it->do_trellis_);
  int16_t coeffs[16];
  int has_nz;

  VP8FTransform(src, ref, coeffs);
  if (use_trellis) {
    const int bx = it->i4_ & 3;
    const int by = it->i4_ >> 2;
    const int ctx = it->top_nz_[bx] + it->left_nz_[by];
    has_nz = TrellisQuantizeBlock(enc, coeffs, levels, ctx, TYPE_I4_AC,
                                  &dqm->y1_, dqm->lambda_trellis_i4_);
  } else {
    has_nz = VP8EncQuantizeBlock(coeffs, levels, &dqm->y1_);
  }
  VP8ITransform(ref, coeffs, yuv_out, 0);
  return has_nz;
}
849 
850 //------------------------------------------------------------------------------
851 // DC-error diffusion
852 
853 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
854 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
855 #define C1 7    // fraction of error sent to the 4x4 block below
856 #define C2 8    // fraction of error sent to the 4x4 block on the right
857 #define DSHIFT 4
858 #define DSCALE 1   // storage descaling, needed to make the error fit int8_t
859 
860 // Quantize as usual, but also compute and return the quantization error.
861 // Error is already divided by DSHIFT.
// Quantizes the single coefficient *v in place (using the DC entry of 'mtx')
// and returns the signed quantization error, pre-scaled down by DSCALE so it
// can later be stored in an int8_t for error diffusion.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  const int negative = (*v < 0);
  const int magnitude = negative ? -*v : *v;
  int residual;
  if (magnitude > (int)mtx->zthresh_[0]) {
    const int quantized =
        QUANTDIV(magnitude, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    residual = magnitude - quantized;
    *v = negative ? -quantized : quantized;
  } else {
    // Below the zero-threshold: the coefficient is dropped entirely and the
    // whole value becomes the error.
    residual = magnitude;
    *v = 0;
  }
  return (negative ? -residual : residual) >> DSCALE;
}
876 
// Applies DC error diffusion to the chroma blocks: before quantizing each
// 4x4 block's DC coefficient, it is corrected with a weighted share (C1 from
// the top neighbor, C2 from the left neighbor, scaled by DSHIFT/DSCALE) of
// the neighbors' quantization errors. The last errors {err1,err2,err3} are
// saved in rd->derr so StoreDiffusionErrors() can propagate them.
static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {   // the two chroma channels
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];   // four 4x4 blocks of this channel
    int err0, err1, err2, err3;
    // The four DCs are processed in raster order so each one can consume the
    // freshly computed error of its already-quantized neighbors.
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
910 
// Saves the final DC quantization errors (rd->derr, computed by
// CorrectDCValues) into the iterator's top/left error buffers, so they get
// diffused into the neighboring macroblocks processed next.
static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    const int err3 = rd->derr[ch][2];
    const int three_quarters = (3 * err3) >> 2;
    left[0] = rd->derr[ch][0];         // restore err1
    left[1] = (int8_t)three_quarters;  //     ... 3/4th of err3
    top[0]  = rd->derr[ch][1];         //     ... err2
    top[1]  = (int8_t)(err3 - three_quarters);  // ... remaining 1/4th of err3
  }
}
923 
924 #undef C1
925 #undef C2
926 #undef DSHIFT
927 #undef DSCALE
928 
929 //------------------------------------------------------------------------------
930 
// Reconstructs the 16x8 chroma (U+V) area for prediction mode 'mode':
// residual computation, forward transform, optional DC error diffusion,
// quantization (optionally trellis-based), then inverse transform back
// into 'yuv_out'. Quantized levels go to rd->uv_levels.
// Returns the non-zero map for the 8 chroma sub-blocks, shifted left by 16
// so it can be OR'ed into the luma nz bits by callers.
static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  // Forward-transform the 8 chroma 4x4 sub-blocks, two at a time.
  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // top_derr_ is only allocated when DC error diffusion is enabled.
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {   // ch is the context offset
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  // Inverse transform + add prediction back.
  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}
972 
973 //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
976 
// Records in dqm->max_edge_ the largest of the first three AC coefficients
// of the DC block, as a measure of the average delta between sub-4x4 blocks.
static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine what is the average
  // delta between each sub-4x4 block.
  const int a = abs(DCs[1]);
  const int b = abs(DCs[2]);
  const int c = abs(DCs[4]);
  int peak = a;
  if (b > peak) peak = b;
  if (c > peak) peak = c;
  if (peak > dqm->max_edge_) dqm->max_edge_ = peak;
}
987 
// Exchanges two VP8ModeScore pointers.
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const saved = *b;
  *b = *a;
  *a = saved;
}
993 
// Exchanges two byte-buffer pointers.
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const saved = *b;
  *b = *a;
  *a = saved;
}
999 
// Swaps the iterator's yuv_out_ and yuv_out2_ buffer pointers (used to
// promote a reconstruction done in the scratch buffer to the main output).
static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}
1003 
// Evaluates all intra-16x16 prediction modes, reconstructing each candidate
// into the scratch buffer and scoring it with SetRDScore(). The winning
// mode's reconstruction ends up in it->yuv_out_ (via SwapOut) and its scores
// and levels in *rd. Also records max-delta info for later loop-filter
// strength adjustment on blocky macroblocks.
static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score: D = SSE distortion, SD = spectral distortion,
    // H = fixed mode-signaling cost, R = coefficient cost.
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // Keep this candidate: swap score structs and output buffers so the
      // best reconstruction so far always lives in it->yuv_out_.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
1061 
1062 //------------------------------------------------------------------------------
1063 
1064 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * WEBP_RESTRICT const it,const uint8_t modes[16])1065 static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
1066                                      const uint8_t modes[16]) {
1067   const int preds_w = it->enc_->preds_w_;
1068   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1069   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1070   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1071   return VP8FixedCostsI4[top][left];
1072 }
1073 
// Evaluates intra-4x4 coding for the whole macroblock, picking the best mode
// for each of the 16 sub-blocks in turn. Bails out early (returning 0, i.e.
// keep intra16) as soon as the accumulated score exceeds the intra16 score
// in *rd, or when the mode-signaling bits exceed enc->max_i4_header_bits_.
// On success, returns 1 and updates *rd / the iterator with the i4 decision.
static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;   // i4 disabled altogether by the header-bits budget
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check: skip the (expensive) coefficient-cost evaluation
      // if this mode already can't beat the current best.
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Keep this reconstruction: swap scratch and best buffers.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // already worse than the intra16 decision: give up
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // mode-signaling budget exceeded
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1168 
1169 //------------------------------------------------------------------------------
1170 
// Evaluates all chroma prediction modes, reconstructing each candidate into
// the scratch buffer and keeping the best one by RD-score. The winning
// reconstruction is copied (if needed) into it->yuv_out_, its score is added
// to *rd, and the DC diffusion errors are stored for the next macroblock.
static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      // Keep this candidate: record its score/levels/errors, and swap the
      // destination pointers so 'dst' tracks the best reconstruction.
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
1220 
1221 //------------------------------------------------------------------------------
1222 // Final reconstruction and quantization.
1223 
// Performs the final quantization/reconstruction of the macroblock using the
// modes already selected (intra16 or intra4 for luma, plus chroma), without
// revisiting the mode decision. The combined non-zero map is stored in rd->nz.
static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  int nz_bits = 0;

  if (it->mb_->type_ != 1) {   // intra4x4 path
    VP8IteratorStartI4(it);
    do {
      const int i4 = it->i4_;
      const int mode = it->preds_[(i4 & 3) + (i4 >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[i4];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[i4];
      MakeIntra4Preds(it);
      nz_bits |= ReconstructIntra4(it, rd->y_ac_levels[i4], src, dst, mode)
              << i4;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  } else {                     // intra16x16 path
    nz_bits =
        ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  nz_bits |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz_bits;
}
1248 
1249 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
// Refines the intra16/intra4/UV mode choices using distortion (SSE) plus a
// small fixed-cost term, instead of full rate-distortion. Used for the
// faster encoding methods. When 'try_both_modes' is set, intra16 and intra4
// compete; otherwise the pre-decided macroblock type is kept. When
// 'refine_uv_mode' is set, the chroma mode is also re-evaluated.
// Reconstructs the final macroblock and fills rd->nz / rd->score.
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;   // i4 starts handicapped by a penalty
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      // Skip expensive-to-signal modes (except DC) over the bit budget.
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // i4 was reconstructed into yuv_out2_: make it current
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
1362 
1363 //------------------------------------------------------------------------------
1364 // Entry point
1365 
// Entry point: decides the coding modes for the current macroblock,
// quantizes and reconstructs it, and returns whether the macroblock can be
// skipped (all residuals are zero).
int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  const int method = it->enc_->method_;
  int skipped;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions needs to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt <= RD_OPT_NONE) {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  } else {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  }

  skipped = (rd->nz == 0);
  VP8SetSkip(it, skipped);
  return skipped;
}
1401